summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/ah4.c13
-rw-r--r--net/ipv4/arp.c8
-rw-r--r--net/ipv4/devinet.c33
-rw-r--r--net/ipv4/esp4.c27
-rw-r--r--net/ipv4/fib_frontend.c13
-rw-r--r--net/ipv4/fib_lookup.h3
-rw-r--r--net/ipv4/fib_semantics.c46
-rw-r--r--net/ipv4/fib_trie.c35
-rw-r--r--net/ipv4/icmp.c10
-rw-r--r--net/ipv4/igmp.c28
-rw-r--r--net/ipv4/inet_connection_sock.c1
-rw-r--r--net/ipv4/ip_gre.c18
-rw-r--r--net/ipv4/ip_tunnel.c6
-rw-r--r--net/ipv4/ip_tunnel_core.c17
-rw-r--r--net/ipv4/ip_vti.c9
-rw-r--r--net/ipv4/ipconfig.c3
-rw-r--r--net/ipv4/ipip.c9
-rw-r--r--net/ipv4/ipmr.c300
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c10
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c5
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/route.c223
-rw-r--r--net/ipv4/syncookies.c17
-rw-r--r--net/ipv4/sysctl_net_ipv4.c67
-rw-r--r--net/ipv4/tcp.c85
-rw-r--r--net/ipv4/tcp_cong.c1
-rw-r--r--net/ipv4/tcp_input.c68
-rw-r--r--net/ipv4/tcp_ipv4.c92
-rw-r--r--net/ipv4/tcp_minisocks.c4
-rw-r--r--net/ipv4/tcp_output.c19
-rw-r--r--net/ipv4/tcp_rate.c1
-rw-r--r--net/ipv4/tcp_timer.c26
-rw-r--r--net/ipv4/tcp_ulp.c135
-rw-r--r--net/ipv4/udp.c107
-rw-r--r--net/ipv4/xfrm4_mode_beet.c3
37 files changed, 1082 insertions, 365 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f83de23a30e7..afcb435adfbe 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_rate.o tcp_recovery.o \
+ tcp_rate.o tcp_recovery.o tcp_ulp.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f3dad1661343..58925b6597de 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1043,7 +1043,7 @@ static struct inet_protosw inetsw_array[] =
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
- .ops = &inet_dgram_ops,
+ .ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
},
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 22377c8ff14b..37db44f60718 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,5 +1,6 @@
#define pr_fmt(fmt) "IPsec: " fmt
+#include <crypto/algapi.h>
#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -220,7 +221,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
sg_init_table(sg, nfrags + sglists);
- skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ if (unlikely(err < 0))
+ goto out_free;
if (x->props.flags & XFRM_STATE_ESN) {
/* Attach seqhi sg right after packet payload */
@@ -277,7 +280,7 @@ static void ah_input_done(struct crypto_async_request *base, int err)
auth_data = ah_tmp_auth(work_iph, ihl);
icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
- err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
if (err)
goto out;
@@ -393,7 +396,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
skb_push(skb, ihl);
sg_init_table(sg, nfrags + sglists);
- skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+ if (unlikely(err < 0))
+ goto out_free;
if (x->props.flags & XFRM_STATE_ESN) {
/* Attach seqhi sg right after packet payload */
@@ -413,7 +418,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
goto out_free;
}
- err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
if (err)
goto out_free;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index ae96e6f3e0cb..8b52179ddc6e 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -539,7 +539,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
- arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev));
+ arp = skb_put(skb, arp_hdr_len(dev));
skb->dev = dev;
skb->protocol = htons(ETH_P_ARP);
if (!src_hw)
@@ -863,8 +863,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+ addr_type = -1;
if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
- addr_type = -1;
is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
sip, tip, sha, tha);
}
@@ -1113,13 +1113,17 @@ static int arp_invalidate(struct net_device *dev, __be32 ip)
{
struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
int err = -ENXIO;
+ struct neigh_table *tbl = &arp_tbl;
if (neigh) {
if (neigh->nud_state & ~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN, 0);
+ write_lock_bh(&tbl->lock);
neigh_release(neigh);
+ neigh_remove_one(neigh, tbl);
+ write_unlock_bh(&tbl->lock);
}
return err;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index df14815a3b8c..a7dd088d5fc9 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -176,6 +176,7 @@ EXPORT_SYMBOL(__ip_dev_find);
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
+static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
@@ -441,6 +442,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
{
struct in_device *in_dev = ifa->ifa_dev;
struct in_ifaddr *ifa1, **ifap, **last_primary;
+ struct in_validator_info ivi;
+ int ret;
ASSERT_RTNL();
@@ -471,6 +474,23 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
}
}
+ /* Allow any devices that wish to register ifaddr validtors to weigh
+ * in now, before changes are committed. The rntl lock is serializing
+ * access here, so the state should not change between a validator call
+ * and a final notify on commit. This isn't invoked on promotion under
+ * the assumption that validators are checking the address itself, and
+ * not the flags.
+ */
+ ivi.ivi_addr = ifa->ifa_address;
+ ivi.ivi_dev = ifa->ifa_dev;
+ ret = blocking_notifier_call_chain(&inetaddr_validator_chain,
+ NETDEV_UP, &ivi);
+ ret = notifier_to_errno(ret);
+ if (ret) {
+ inet_free_ifa(ifa);
+ return ret;
+ }
+
if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
prandom_seed((__force u32) ifa->ifa_local);
ifap = last_primary;
@@ -1356,6 +1376,19 @@ int unregister_inetaddr_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_inetaddr_notifier);
+int register_inetaddr_validator_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&inetaddr_validator_chain, nb);
+}
+EXPORT_SYMBOL(register_inetaddr_validator_notifier);
+
+int unregister_inetaddr_validator_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&inetaddr_validator_chain,
+ nb);
+}
+EXPORT_SYMBOL(unregister_inetaddr_validator_notifier);
+
/* Rename ifa_labels for a device name change. Make some effort to preserve
* existing alias numbering and to create unique labels if possible.
*/
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 65cc02bd82bc..1f18b4650253 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
u8 *tail;
u8 *vaddr;
int nfrags;
+ int esph_offset;
struct page *page;
struct sk_buff *trailer;
int tailen = esp->tailen;
@@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
}
cow:
+ esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
nfrags = skb_cow_data(skb, tailen, &trailer);
if (nfrags < 0)
goto out;
tail = skb_tail_pointer(trailer);
- esp->esph = ip_esp_hdr(skb);
+ esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
skip_cow:
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
@@ -374,9 +377,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
esp->esph = esph;
sg_init_table(sg, esp->nfrags);
- skb_to_sgvec(skb, sg,
- (unsigned char *)esph - skb->data,
- assoclen + ivlen + esp->clen + alen);
+ err = skb_to_sgvec(skb, sg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + esp->clen + alen);
+ if (unlikely(err < 0))
+ goto error;
if (!esp->inplace) {
int allocsize;
@@ -400,9 +405,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
spin_unlock_bh(&x->lock);
sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
- skb_to_sgvec(skb, dsg,
- (unsigned char *)esph - skb->data,
- assoclen + ivlen + esp->clen + alen);
+ err = skb_to_sgvec(skb, dsg,
+ (unsigned char *)esph - skb->data,
+ assoclen + ivlen + esp->clen + alen);
+ if (unlikely(err < 0))
+ goto error;
}
if ((x->props.flags & XFRM_STATE_ESN))
@@ -602,7 +609,7 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
* decryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
- esph = (void *)skb_push(skb, 4);
+ esph = skb_push(skb, 4);
*seqhi = esph->spi;
esph->spi = esph->seq_no;
esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
@@ -687,7 +694,9 @@ skip_cow:
esp_input_set_header(skb, seqhi);
sg_init_table(sg, nfrags);
- skb_to_sgvec(skb, sg, 0, skb->len);
+ err = skb_to_sgvec(skb, sg, 0, skb->len);
+ if (unlikely(err < 0))
+ goto out;
skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 14d2f7bd7c76..4e678fa892dd 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -588,7 +588,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCDELRT) {
tb = fib_get_table(net, cfg.fc_table);
if (tb)
- err = fib_table_delete(net, tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg,
+ NULL);
else
err = -ESRCH;
} else {
@@ -684,7 +685,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
break;
case RTA_MULTIPATH:
err = lwtunnel_valid_encap_type_attr(nla_data(attr),
- nla_len(attr));
+ nla_len(attr),
+ extack);
if (err < 0)
goto errout;
cfg->fc_mp = nla_data(attr);
@@ -701,7 +703,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
break;
case RTA_ENCAP_TYPE:
cfg->fc_encap_type = nla_get_u16(attr);
- err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+ err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
+ extack);
if (err < 0)
goto errout;
break;
@@ -732,7 +735,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = fib_table_delete(net, tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg, extack);
errout:
return err;
}
@@ -851,7 +854,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
if (cmd == RTM_NEWROUTE)
fib_table_insert(net, tb, &cfg, NULL);
else
- fib_table_delete(net, tb, &cfg);
+ fib_table_delete(net, tb, &cfg, NULL);
}
void fib_add_ifaddr(struct in_ifaddr *ifa)
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 2704e08545da..769ab87ebc4b 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -30,7 +30,8 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack);
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+ struct netlink_ext_ack *extack);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
unsigned int);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4852e183afe0..ff47ea1408fe 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -152,7 +152,8 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
* free_fib_info_rcu()
*/
- dst_free(&rt->dst);
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
}
static void free_nh_exceptions(struct fib_nh *nh)
@@ -194,8 +195,10 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
struct rtable *rt;
rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
- if (rt)
- dst_free(&rt->dst);
+ if (rt) {
+ dst_dev_put(&rt->dst);
+ dst_release_immediate(&rt->dst);
+ }
}
free_percpu(rtp);
}
@@ -204,6 +207,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ struct dst_metrics *m;
change_nexthops(fi) {
if (nexthop_nh->nh_dev)
@@ -214,8 +218,9 @@ static void free_fib_info_rcu(struct rcu_head *head)
rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
- if (fi->fib_metrics != (u32 *) dst_default_metrics)
- kfree(fi->fib_metrics);
+ m = fi->fib_metrics;
+ if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
+ kfree(m);
kfree(fi);
}
@@ -530,7 +535,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
ret = lwtunnel_build_state(nla_get_u16(
nla_entype),
nla, AF_INET, cfg,
- &lwtstate);
+ &lwtstate, extack);
if (ret)
goto errout;
nexthop_nh->nh_lwtstate =
@@ -612,7 +617,8 @@ static inline void fib_add_weight(struct fib_info *fi,
static int fib_encap_match(u16 encap_type,
struct nlattr *encap,
const struct fib_nh *nh,
- const struct fib_config *cfg)
+ const struct fib_config *cfg,
+ struct netlink_ext_ack *extack)
{
struct lwtunnel_state *lwtstate;
int ret, result = 0;
@@ -620,8 +626,8 @@ static int fib_encap_match(u16 encap_type,
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- ret = lwtunnel_build_state(encap_type, encap,
- AF_INET, cfg, &lwtstate);
+ ret = lwtunnel_build_state(encap_type, encap, AF_INET,
+ cfg, &lwtstate, extack);
if (!ret) {
result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
lwtstate_free(lwtstate);
@@ -630,7 +636,8 @@ static int fib_encap_match(u16 encap_type,
return result;
}
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
+int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+ struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct rtnexthop *rtnh;
@@ -642,9 +649,9 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
if (cfg->fc_oif || cfg->fc_gw) {
if (cfg->fc_encap) {
- if (fib_encap_match(cfg->fc_encap_type,
- cfg->fc_encap, fi->fib_nh, cfg))
- return 1;
+ if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
+ fi->fib_nh, cfg, extack))
+ return 1;
}
if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
(!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
@@ -1010,11 +1017,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
val = 255;
if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
return -EINVAL;
- fi->fib_metrics[type - 1] = val;
+ fi->fib_metrics->metrics[type - 1] = val;
}
if (ecn_ca)
- fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+ fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
return 0;
}
@@ -1078,11 +1085,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto failure;
fib_info_cnt++;
if (cfg->fc_mx) {
- fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
if (!fi->fib_metrics)
goto failure;
+ atomic_set(&fi->fib_metrics->refcnt, 1);
} else
- fi->fib_metrics = (u32 *) dst_default_metrics;
+ fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
@@ -1145,7 +1153,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
}
err = lwtunnel_build_state(cfg->fc_encap_type,
cfg->fc_encap, AF_INET, cfg,
- &lwtstate);
+ &lwtstate, extack);
if (err)
goto failure;
@@ -1313,7 +1321,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
if (fi->fib_priority &&
nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
goto nla_put_failure;
- if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+ if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
goto nla_put_failure;
if (fi->fib_prefsrc &&
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 6d0f6c79d9aa..d56659e97a6e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1099,6 +1099,22 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
return 0;
}
+static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
+{
+ if (plen > KEYLENGTH) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix length");
+ return false;
+ }
+
+ if ((plen < KEYLENGTH) && (key << plen)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid prefix for given prefix length");
+ return false;
+ }
+
+ return true;
+}
+
/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
@@ -1115,16 +1131,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
u32 key;
int err;
- if (plen > KEYLENGTH)
- return -EINVAL;
-
key = ntohl(cfg->fc_dst);
- pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
-
- if ((plen < KEYLENGTH) && (key << plen))
+ if (!fib_valid_key_len(key, plen, extack))
return -EINVAL;
+ pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
+
fi = fib_create_info(cfg, extack);
if (IS_ERR(fi)) {
err = PTR_ERR(fi);
@@ -1452,6 +1465,7 @@ found:
if (!(fib_flags & FIB_LOOKUP_NOREF))
atomic_inc(&fi->fib_clntref);
+ res->prefix = htonl(n->key);
res->prefixlen = KEYLENGTH - fa->fa_slen;
res->nh_sel = nhsel;
res->type = fa->fa_type;
@@ -1507,7 +1521,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
- struct fib_config *cfg)
+ struct fib_config *cfg, struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete;
@@ -1517,12 +1531,9 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
u8 tos = cfg->fc_tos;
u32 key;
- if (plen > KEYLENGTH)
- return -EINVAL;
-
key = ntohl(cfg->fc_dst);
- if ((plen < KEYLENGTH) && (key << plen))
+ if (!fib_valid_key_len(key, plen, extack))
return -EINVAL;
l = fib_find_node(t, &tp, key);
@@ -1551,7 +1562,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
+ fib_nh_match(cfg, fi, extack) == 0) {
fa_to_delete = fa;
break;
}
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 43318b5f5647..c2be26b98b5f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -489,7 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key_hash(net, fl4, skb_in);
+ rt = ip_route_output_key_hash(net, fl4, skb_in);
if (IS_ERR(rt))
return rt;
@@ -657,8 +657,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
/* Needed by both icmp_global_allow and icmp_xmit_lock */
local_bh_disable();
- /* Check global sysctl_icmp_msgs_per_sec ratelimit */
- if (!icmpv4_global_allow(net, type, code))
+ /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
+ * incoming dev is loopback. If outgoing dev change to not be
+ * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
+ */
+ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
+ !icmpv4_global_allow(net, type, code))
goto out_bh_enable;
sk = icmp_xmit_lock(net);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 44fd86de2823..c4032302d7cd 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -414,7 +414,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
skb = igmpv3_newpack(dev, dev->mtu);
if (!skb)
return NULL;
- pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
+ pgr = skb_put(skb, sizeof(struct igmpv3_grec));
pgr->grec_type = type;
pgr->grec_auxwords = 0;
pgr->grec_nsrcs = 0;
@@ -508,7 +508,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
}
if (!skb)
return NULL;
- psrc = (__be32 *)skb_put(skb, sizeof(__be32));
+ psrc = skb_put(skb, sizeof(__be32));
*psrc = psf->sf_inaddr;
scount++; stotal++;
if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
@@ -742,7 +742,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
((u8 *)&iph[1])[2] = 0;
((u8 *)&iph[1])[3] = 0;
- ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ ih = skb_put(skb, sizeof(struct igmphdr));
ih->type = type;
ih->code = 0;
ih->csum = 0;
@@ -1112,6 +1112,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
if (!pmc)
return;
+ spin_lock_init(&pmc->lock);
spin_lock_bh(&im->lock);
pmc->interface = im->interface;
in_dev_hold(in_dev);
@@ -2071,21 +2072,26 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
- struct ip_sf_list *psf, *nextpsf;
+ struct ip_sf_list *psf, *nextpsf, *tomb, *sources;
- for (psf = pmc->tomb; psf; psf = nextpsf) {
+ spin_lock_bh(&pmc->lock);
+ tomb = pmc->tomb;
+ pmc->tomb = NULL;
+ sources = pmc->sources;
+ pmc->sources = NULL;
+ pmc->sfmode = MCAST_EXCLUDE;
+ pmc->sfcount[MCAST_INCLUDE] = 0;
+ pmc->sfcount[MCAST_EXCLUDE] = 1;
+ spin_unlock_bh(&pmc->lock);
+
+ for (psf = tomb; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
- pmc->tomb = NULL;
- for (psf = pmc->sources; psf; psf = nextpsf) {
+ for (psf = sources; psf; psf = nextpsf) {
nextpsf = psf->sf_next;
kfree(psf);
}
- pmc->sources = NULL;
- pmc->sfmode = MCAST_EXCLUDE;
- pmc->sfcount[MCAST_INCLUDE] = 0;
- pmc->sfcount[MCAST_EXCLUDE] = 1;
}
/* Join a multicast group
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 82dec8825d28..a3fa1a5b6d98 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -790,7 +790,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
- newsk->sk_write_space = sk_stream_write_space;
/* listeners have SOCK_RCU_FREE, not the children */
sock_reset_flag(newsk, SOCK_RCU_FREE);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index e90c80a548ad..7a7829e839c2 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -592,7 +592,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
struct iphdr *iph;
struct gre_base_hdr *greh;
- iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
+ iph = skb_push(skb, t->hlen + sizeof(*iph));
greh = (struct gre_base_hdr *)(iph+1);
greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
greh->protocol = htons(type);
@@ -779,7 +779,8 @@ static struct pernet_operations ipgre_net_ops = {
.size = sizeof(struct ip_tunnel_net),
};
-static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
__be16 flags;
@@ -802,7 +803,8 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
return 0;
}
-static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
__be32 daddr;
@@ -823,7 +825,7 @@ static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
}
out:
- return ipgre_tunnel_validate(tb, data);
+ return ipgre_tunnel_validate(tb, data, extack);
}
static int ipgre_netlink_parms(struct net_device *dev,
@@ -957,7 +959,8 @@ static void ipgre_tap_setup(struct net_device *dev)
}
static int ipgre_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
@@ -979,7 +982,8 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
}
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
@@ -1155,7 +1159,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
t = netdev_priv(dev);
t->collect_md = true;
- err = ipgre_newlink(net, dev, tb, NULL);
+ err = ipgre_newlink(net, dev, tb, NULL, NULL);
if (err < 0) {
free_netdev(dev);
return ERR_PTR(err);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index b878ecbc0608..129d1a3616f8 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -446,6 +446,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
return 0;
drop:
+ if (tun_dst)
+ dst_release((struct dst_entry *)tun_dst);
kfree_skb(skb);
return 0;
}
@@ -967,7 +969,6 @@ static void ip_tunnel_dev_free(struct net_device *dev)
gro_cells_destroy(&tunnel->gro_cells);
dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
- free_netdev(dev);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
@@ -1155,7 +1156,8 @@ int ip_tunnel_init(struct net_device *dev)
struct iphdr *iph = &tunnel->parms.iph;
int err;
- dev->destructor = ip_tunnel_dev_free;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = ip_tunnel_dev_free;
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index baf196eaf1d8..2f39479be92f 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -134,10 +134,12 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
struct metadata_dst *res;
struct ip_tunnel_info *dst, *src;
- if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
+ if (!md || md->type != METADATA_IP_TUNNEL ||
+ md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
+
return NULL;
- res = metadata_dst_alloc(0, flags);
+ res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
if (!res)
return NULL;
@@ -228,14 +230,16 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
static int ip_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
- struct lwtunnel_state **ts)
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel_info *tun_info;
struct lwtunnel_state *new_state;
struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
int err;
- err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, NULL);
+ err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy,
+ extack);
if (err < 0)
return err;
@@ -325,7 +329,8 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
static int ip6_tun_build_state(struct nlattr *attr,
unsigned int family, const void *cfg,
- struct lwtunnel_state **ts)
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel_info *tun_info;
struct lwtunnel_state *new_state;
@@ -333,7 +338,7 @@ static int ip6_tun_build_state(struct nlattr *attr,
int err;
err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy,
- NULL);
+ extack);
if (err < 0)
return err;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 4ec9affb2252..0192c255e508 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -465,7 +465,8 @@ static struct pernet_operations vti_net_ops = {
.size = sizeof(struct ip_tunnel_net),
};
-static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -503,7 +504,8 @@ static void vti_netlink_parms(struct nlattr *data[],
}
static int vti_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel_parm parms;
__u32 fwmark = 0;
@@ -513,7 +515,8 @@ static int vti_newlink(struct net *src_net, struct net_device *dev,
}
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
__u32 fwmark = t->fwmark;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index c3b12b1c7162..4c5dfe6bd34d 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -813,8 +813,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
if (!skb)
return;
skb_reserve(skb, hlen);
- b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
- memset(b, 0, sizeof(struct bootp_pkt));
+ b = skb_put_zero(skb, sizeof(struct bootp_pkt));
/* Construct IP header */
skb_reset_network_header(skb);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 1e441c6f2160..fb1ad22b5e29 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -375,7 +375,8 @@ static int ipip_tunnel_init(struct net_device *dev)
return ip_tunnel_init(dev);
}
-static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
u8 proto;
@@ -469,7 +470,8 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
}
static int ipip_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[])
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
@@ -488,7 +490,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
}
static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
- struct nlattr *data[])
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_parm p;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 551de4d023a8..bb909f1d7537 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -101,14 +101,15 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static void ipmr_free_table(struct mr_table *mrt);
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local);
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
+static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
@@ -501,7 +502,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
- dev->destructor = free_netdev;
+ dev->needs_free_netdev = true;
dev->features |= NETIF_F_NETNS_LOCAL;
}
@@ -669,7 +670,8 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
- struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ struct nlmsghdr *nlh = skb_pull(skb,
+ sizeof(struct iphdr));
nlh->nlmsg_type = NLMSG_ERROR;
nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
skb_trim(skb, nlh->nlmsg_len);
@@ -972,7 +974,8 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
/* Play the pending entries through our router */
while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
- struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+ struct nlmsghdr *nlh = skb_pull(skb,
+ sizeof(struct iphdr));
if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
nlh->nlmsg_len = skb_tail_pointer(skb) -
@@ -988,13 +991,12 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else {
- ip_mr_forward(net, mrt, skb, c, 0);
+ ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
}
}
}
-/* Bounce a cache query up to mrouted. We could use netlink for this but mrouted
- * expects the following bizarre scheme.
+/* Bounce a cache query up to mrouted and netlink.
*
* Called under mrt_lock.
*/
@@ -1044,7 +1046,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
msg->im_vif = vifi;
skb_dst_set(skb, dst_clone(skb_dst(pkt)));
/* Add our header */
- igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp = skb_put(skb, sizeof(struct igmphdr));
igmp->type = assert;
msg->im_msgtype = assert;
igmp->code = 0;
@@ -1060,6 +1062,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
return -EINVAL;
}
+ igmpmsg_netlink_event(mrt, skb);
+
/* Deliver to mrouted */
ret = sock_queue_rcv_skb(mroute_sk, skb);
rcu_read_unlock();
@@ -1073,7 +1077,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
/* Queue a packet for resolution. It gets locked cache entry! */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
{
const struct iphdr *iph = ip_hdr(skb);
struct mfc_cache *c;
@@ -1130,6 +1134,10 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+ if (dev) {
+ skb->dev = dev;
+ skb->skb_iif = dev->ifindex;
+ }
skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -1828,10 +1836,10 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
/* "local" means that we should preserve one skb (for local delivery) */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc_cache *cache,
- int local)
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *cache, int local)
{
- int true_vifi = ipmr_find_vif(mrt, skb->dev);
+ int true_vifi = ipmr_find_vif(mrt, dev);
int psend = -1;
int vif, ct;
@@ -1853,13 +1861,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
/* Wrong interface: drop packet and (maybe) send PIM assert. */
- if (mrt->vif_table[vif].dev != skb->dev) {
- struct net_device *mdev;
-
- mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev);
- if (mdev == skb->dev)
- goto forward;
-
+ if (mrt->vif_table[vif].dev != dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -2053,7 +2055,7 @@ int ip_mr_input(struct sk_buff *skb)
read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, dev);
if (vif >= 0) {
- int err2 = ipmr_cache_unresolved(mrt, vif, skb);
+ int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
read_unlock(&mrt_lock);
return err2;
@@ -2064,7 +2066,7 @@ int ip_mr_input(struct sk_buff *skb)
}
read_lock(&mrt_lock);
- ip_mr_forward(net, mrt, skb, cache, local);
+ ip_mr_forward(net, mrt, dev, skb, cache, local);
read_unlock(&mrt_lock);
if (local)
@@ -2238,7 +2240,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
iph->saddr = saddr;
iph->daddr = daddr;
iph->version = 0;
- err = ipmr_cache_unresolved(mrt, vif, skb2);
+ err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
@@ -2341,6 +2343,130 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}
+static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
+{
+ size_t len =
+ NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ + nla_total_size(1) /* IPMRA_CREPORT_MSGTYPE */
+ + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */
+ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */
+ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */
+ /* IPMRA_CREPORT_PKT */
+ + nla_total_size(payloadlen)
+ ;
+
+ return len;
+}
+
+static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+{
+ struct net *net = read_pnet(&mrt->net);
+ struct nlmsghdr *nlh;
+ struct rtgenmsg *rtgenm;
+ struct igmpmsg *msg;
+ struct sk_buff *skb;
+ struct nlattr *nla;
+ int payloadlen;
+
+ payloadlen = pkt->len - sizeof(struct igmpmsg);
+ msg = (struct igmpmsg *)skb_network_header(pkt);
+
+ skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC);
+ if (!skb)
+ goto errout;
+
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
+ sizeof(struct rtgenmsg), 0);
+ if (!nlh)
+ goto errout;
+ rtgenm = nlmsg_data(nlh);
+ rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
+ if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
+ nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) ||
+ nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
+ msg->im_src.s_addr) ||
+ nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
+ msg->im_dst.s_addr))
+ goto nla_put_failure;
+
+ nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
+ if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg),
+ nla_data(nla), payloadlen))
+ goto nla_put_failure;
+
+ nlmsg_end(skb, nlh);
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
+ return;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+errout:
+ kfree_skb(skb);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
+}
+
+static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[RTA_MAX + 1];
+ struct sk_buff *skb = NULL;
+ struct mfc_cache *cache;
+ struct mr_table *mrt;
+ struct rtmsg *rtm;
+ __be32 src, grp;
+ u32 tableid;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err < 0)
+ goto errout;
+
+ rtm = nlmsg_data(nlh);
+
+ src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
+ grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
+ tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0;
+
+ mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
+ if (IS_ERR(mrt)) {
+ err = PTR_ERR(mrt);
+ goto errout_free;
+ }
+
+ /* entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ cache = ipmr_cache_find(mrt, src, grp);
+ rcu_read_unlock();
+ if (!cache) {
+ err = -ENOENT;
+ goto errout_free;
+ }
+
+ skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout_free;
+ }
+
+ err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, cache,
+ RTM_NEWROUTE, 0);
+ if (err < 0)
+ goto errout_free;
+
+ err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+errout:
+ return err;
+
+errout_free:
+ kfree_skb(skb);
+ goto errout;
+}
+
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -2528,6 +2654,129 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
return ipmr_mfc_delete(tbl, &mfcc, parent);
}
+static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
+{
+ u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);
+
+ if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
+ nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
+ nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM,
+ mrt->mroute_reg_vif_num) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
+ mrt->mroute_do_assert) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
+ return false;
+
+ return true;
+}
+
+static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
+{
+ struct nlattr *vif_nest;
+ struct vif_device *vif;
+
+ /* if the VIF doesn't exist just continue */
+ if (!VIF_EXISTS(mrt, vifid))
+ return true;
+
+ vif = &mrt->vif_table[vifid];
+ vif_nest = nla_nest_start(skb, IPMRA_VIF);
+ if (!vif_nest)
+ return false;
+ if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
+ nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
+ nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
+ IPMRA_VIFA_PAD) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out,
+ IPMRA_VIFA_PAD) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in,
+ IPMRA_VIFA_PAD) ||
+ nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out,
+ IPMRA_VIFA_PAD) ||
+ nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) ||
+ nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) {
+ nla_nest_cancel(skb, vif_nest);
+ return false;
+ }
+ nla_nest_end(skb, vif_nest);
+
+ return true;
+}
+
+static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlmsghdr *nlh = NULL;
+ unsigned int t = 0, s_t;
+ unsigned int e = 0, s_e;
+ struct mr_table *mrt;
+
+ s_t = cb->args[0];
+ s_e = cb->args[1];
+
+ ipmr_for_each_table(mrt, net) {
+ struct nlattr *vifs, *af;
+ struct ifinfomsg *hdr;
+ u32 i;
+
+ if (t < s_t)
+ goto skip_table;
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_NEWLINK,
+ sizeof(*hdr), NLM_F_MULTI);
+ if (!nlh)
+ break;
+
+ hdr = nlmsg_data(nlh);
+ memset(hdr, 0, sizeof(*hdr));
+ hdr->ifi_family = RTNL_FAMILY_IPMR;
+
+ af = nla_nest_start(skb, IFLA_AF_SPEC);
+ if (!af) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ if (!ipmr_fill_table(mrt, skb)) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS);
+ if (!vifs) {
+ nla_nest_end(skb, af);
+ nlmsg_end(skb, nlh);
+ goto out;
+ }
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (e < s_e)
+ goto skip_entry;
+ if (!ipmr_fill_vif(mrt, i, skb)) {
+ nla_nest_end(skb, vifs);
+ nla_nest_end(skb, af);
+ nlmsg_end(skb, nlh);
+ goto out;
+ }
+skip_entry:
+ e++;
+ }
+ s_e = 0;
+ e = 0;
+ nla_nest_end(skb, vifs);
+ nla_nest_end(skb, af);
+ nlmsg_end(skb, nlh);
+skip_table:
+ t++;
+ }
+
+out:
+ cb->args[1] = e;
+ cb->args[0] = t;
+
+ return skb->len;
+}
+
#ifdef CONFIG_PROC_FS
/* The /proc interfaces to multicast routing :
* /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
@@ -2865,11 +3114,14 @@ int __init ip_mr_init(void)
}
#endif
rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
- NULL, ipmr_rtm_dumproute, NULL);
+ ipmr_rtm_getroute, ipmr_rtm_dumproute, NULL);
rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
ipmr_rtm_route, NULL, NULL);
rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
ipmr_rtm_route, NULL, NULL);
+
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK,
+ NULL, ipmr_rtm_dumplink, NULL);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index af2b69b6895f..f1528f7175a8 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -24,7 +24,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
struct iphdr *iph;
skb_reset_network_header(skb);
- iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+ iph = skb_put(skb, sizeof(*iph));
iph->version = 4;
iph->ihl = sizeof(*iph) / 4;
iph->tos = 0;
@@ -91,7 +91,7 @@ synproxy_send_client_synack(struct net *net,
niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
- nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->dest;
nth->dest = th->source;
nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
@@ -133,7 +133,7 @@ synproxy_send_server_syn(struct net *net,
niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
- nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->source;
nth->dest = th->dest;
nth->seq = htonl(recv_seq - 1);
@@ -178,7 +178,7 @@ synproxy_send_server_ack(struct net *net,
niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
- nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->dest;
nth->dest = th->source;
nth->seq = htonl(ntohl(th->ack_seq));
@@ -216,7 +216,7 @@ synproxy_send_client_ack(struct net *net,
niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
- nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
+ nth = skb_put(nskb, tcp_hdr_size);
nth->source = th->source;
nth->dest = th->dest;
nth->seq = htonl(ntohl(th->seq) + 1);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 6f8d9e5e062b..eeacbdaf7cdf 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -51,7 +51,7 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
struct iphdr *niph, *oiph = ip_hdr(oldskb);
skb_reset_network_header(nskb);
- niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
+ niph = skb_put(nskb, sizeof(struct iphdr));
niph->version = 4;
niph->ihl = sizeof(struct iphdr) / 4;
niph->tos = 0;
@@ -76,8 +76,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
struct tcphdr *tcph;
skb_reset_transport_header(nskb);
- tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
- memset(tcph, 0, sizeof(*tcph));
+ tcph = skb_put_zero(nskb, sizeof(struct tcphdr));
tcph->source = oth->dest;
tcph->dest = oth->source;
tcph->doff = sizeof(struct tcphdr) / 4;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index fa44e752a9a3..43eb6567b3a0 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -250,6 +250,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+ SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO),
SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 655d9eebe43e..c816cd53f7fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -114,6 +114,8 @@
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
+#include "fib_lookup.h"
+
#define RT_FL_TOS(oldflp4) \
((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
@@ -587,11 +589,6 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
build_sk_flow_key(fl4, sk);
}
-static inline void rt_free(struct rtable *rt)
-{
- call_rcu(&rt->dst.rcu_head, dst_rcu_free);
-}
-
static DEFINE_SPINLOCK(fnhe_lock);
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
@@ -601,12 +598,14 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
rt = rcu_dereference(fnhe->fnhe_rth_input);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt) {
RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
- rt_free(rt);
+ dst_dev_put(&rt->dst);
+ dst_release(&rt->dst);
}
}
@@ -1300,7 +1299,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
- __be32 daddr)
+ __be32 daddr, const bool do_cache)
{
bool ret = false;
@@ -1329,10 +1328,13 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
- if (!(rt->dst.flags & DST_NOCACHE)) {
+ if (do_cache) {
+ dst_hold(&rt->dst);
rcu_assign_pointer(*porig, rt);
- if (orig)
- rt_free(orig);
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
ret = true;
}
@@ -1355,12 +1357,20 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
}
orig = *p;
+ /* hold dst before doing cmpxchg() to avoid race condition
+ * on this dst
+ */
+ dst_hold(&rt->dst);
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
- if (orig)
- rt_free(orig);
- } else
+ if (orig) {
+ dst_dev_put(&orig->dst);
+ dst_release(&orig->dst);
+ }
+ } else {
+ dst_release(&rt->dst);
ret = false;
+ }
return ret;
}
@@ -1385,8 +1395,12 @@ static void rt_add_uncached_list(struct rtable *rt)
static void ipv4_dst_destroy(struct dst_entry *dst)
{
+ struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
struct rtable *rt = (struct rtable *) dst;
+ if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
+ kfree(p);
+
if (!list_empty(&rt->rt_uncached)) {
struct uncached_list *ul = rt->rt_uncached_list;
@@ -1427,7 +1441,8 @@ static bool rt_cache_valid(const struct rtable *rt)
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
const struct fib_result *res,
struct fib_nh_exception *fnhe,
- struct fib_info *fi, u16 type, u32 itag)
+ struct fib_info *fi, u16 type, u32 itag,
+ const bool do_cache)
{
bool cached = false;
@@ -1438,14 +1453,18 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
rt->rt_gateway = nh->nh_gw;
rt->rt_uses_gateway = 1;
}
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
+ if (fi->fib_metrics != &dst_default_metrics) {
+ rt->dst._metrics |= DST_METRICS_REFCOUNTED;
+ atomic_inc(&fi->fib_metrics->refcnt);
+ }
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (unlikely(fnhe))
- cached = rt_bind_exception(rt, fnhe, daddr);
- else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
+ else if (do_cache)
cached = rt_cache_route(nh, rt);
if (unlikely(!cached)) {
/* Routes we intend to cache in nexthop exception or
@@ -1453,7 +1472,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
- rt->dst.flags |= DST_NOCACHE;
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
rt_add_uncached_list(rt);
@@ -1476,7 +1494,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
struct rtable *rt;
rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
+ (will_cache ? 0 : DST_HOST) |
(nopolicy ? DST_NOPOLICY : 0) |
(noxfrm ? DST_NOXFRM : 0));
@@ -1720,7 +1738,8 @@ rt_cache:
rth->dst.input = ip_forward;
- rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
+ do_cache);
set_lwt_redirect(rth);
skb_dst_set(skb, &rth->dst);
out:
@@ -1852,9 +1871,9 @@ static int ip_mkroute_input(struct sk_buff *skb,
*/
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
+ u8 tos, struct net_device *dev,
+ struct fib_result *res)
{
- struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct ip_tunnel_info *tun_info;
struct flowi4 fl4;
@@ -1884,8 +1903,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
goto martian_source;
- res.fi = NULL;
- res.table = NULL;
+ res->fi = NULL;
+ res->table = NULL;
if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
goto brd_input;
@@ -1921,17 +1940,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- err = fib_lookup(net, &fl4, &res, 0);
+ err = fib_lookup(net, &fl4, res, 0);
if (err != 0) {
if (!IN_DEV_FORWARD(in_dev))
err = -EHOSTUNREACH;
goto no_route;
}
- if (res.type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST)
goto brd_input;
- if (res.type == RTN_LOCAL) {
+ if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
0, dev, in_dev, &itag);
if (err < 0)
@@ -1943,10 +1962,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
err = -EHOSTUNREACH;
goto no_route;
}
- if (res.type != RTN_UNICAST)
+ if (res->type != RTN_UNICAST)
goto martian_destination;
- err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos);
+ err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
out: return err;
brd_input:
@@ -1960,14 +1979,14 @@ brd_input:
goto martian_source;
}
flags |= RTCF_BROADCAST;
- res.type = RTN_BROADCAST;
+ res->type = RTN_BROADCAST;
RT_CACHE_STAT_INC(in_brd);
local_input:
do_cache = false;
- if (res.fi) {
+ if (res->fi) {
if (!itag) {
- rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
err = 0;
@@ -1978,7 +1997,7 @@ local_input:
}
rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
- flags | RTCF_LOCAL, res.type,
+ flags | RTCF_LOCAL, res->type,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
if (!rth)
goto e_nobufs;
@@ -1988,18 +2007,18 @@ local_input:
rth->dst.tclassid = itag;
#endif
rth->rt_is_input = 1;
- if (res.table)
- rth->rt_table_id = res.table->tb_id;
+ if (res->table)
+ rth->rt_table_id = res->table->tb_id;
RT_CACHE_STAT_INC(in_slow_tot);
- if (res.type == RTN_UNREACHABLE) {
+ if (res->type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
if (do_cache) {
- struct fib_nh *nh = &FIB_RES_NH(res);
+ struct fib_nh *nh = &FIB_RES_NH(*res);
rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
@@ -2008,10 +2027,8 @@ local_input:
rth->dst.input = lwtunnel_input;
}
- if (unlikely(!rt_cache_route(nh, rth))) {
- rth->dst.flags |= DST_NOCACHE;
+ if (unlikely(!rt_cache_route(nh, rth)))
rt_add_uncached_list(rth);
- }
}
skb_dst_set(skb, &rth->dst);
err = 0;
@@ -2019,9 +2036,9 @@ local_input:
no_route:
RT_CACHE_STAT_INC(in_no_route);
- res.type = RTN_UNREACHABLE;
- res.fi = NULL;
- res.table = NULL;
+ res->type = RTN_UNREACHABLE;
+ res->fi = NULL;
+ res->table = NULL;
goto local_input;
/*
@@ -2051,11 +2068,22 @@ martian_source:
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
- int res;
+ struct fib_result res;
+ int err;
tos &= IPTOS_RT_MASK;
rcu_read_lock();
+ err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(ip_route_input_noref);
+
+/* called with rcu_read_lock held */
+int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ u8 tos, struct net_device *dev, struct fib_result *res)
+{
/* Multicast recognition logic is moved from route cache to here.
The problem was that too many Ethernet cards have broken/missing
hardware multicast filters :-( As result the host on multicasting
@@ -2070,6 +2098,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
int our = 0;
+ int err = -EINVAL;
if (in_dev)
our = ip_check_mc_rcu(in_dev, daddr, saddr,
@@ -2085,7 +2114,6 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
ip_hdr(skb)->protocol);
}
- res = -EINVAL;
if (our
#ifdef CONFIG_IP_MROUTE
||
@@ -2093,17 +2121,14 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
IN_DEV_MFORWARD(in_dev))
#endif
) {
- res = ip_route_input_mc(skb, daddr, saddr,
+ err = ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
}
- rcu_read_unlock();
- return res;
+ return err;
}
- res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
- rcu_read_unlock();
- return res;
+
+ return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
-EXPORT_SYMBOL(ip_route_input_noref);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
@@ -2199,10 +2224,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth = rcu_dereference(*prth);
rt_cache:
- if (rt_cache_valid(rth)) {
- dst_hold(&rth->dst);
+ if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth;
- }
}
add:
@@ -2236,7 +2259,7 @@ add:
#endif
}
- rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
+ rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
set_lwt_redirect(rth);
return rth;
@@ -2246,29 +2269,40 @@ add:
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
- const struct sk_buff *skb)
+struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ const struct sk_buff *skb)
{
- struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
- unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
- int orig_oif;
- int err = -ENETUNREACH;
res.tclassid = 0;
res.fi = NULL;
res.table = NULL;
- orig_oif = fl4->flowi4_oif;
-
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
rcu_read_lock();
+ rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
+ rcu_read_unlock();
+
+ return rth;
+}
+EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
+
+struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
+ struct fib_result *res,
+ const struct sk_buff *skb)
+{
+ struct net_device *dev_out = NULL;
+ int orig_oif = fl4->flowi4_oif;
+ unsigned int flags = 0;
+ struct rtable *rth;
+ int err = -ENETUNREACH;
+
if (fl4->saddr) {
rth = ERR_PTR(-EINVAL);
if (ipv4_is_multicast(fl4->saddr) ||
@@ -2354,15 +2388,15 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
dev_out = net->loopback_dev;
fl4->flowi4_oif = LOOPBACK_IFINDEX;
- res.type = RTN_LOCAL;
+ res->type = RTN_LOCAL;
flags |= RTCF_LOCAL;
goto make_route;
}
- err = fib_lookup(net, fl4, &res, 0);
+ err = fib_lookup(net, fl4, res, 0);
if (err) {
- res.fi = NULL;
- res.table = NULL;
+ res->fi = NULL;
+ res->table = NULL;
if (fl4->flowi4_oif &&
(ipv4_is_multicast(fl4->daddr) ||
!netif_index_is_l3_master(net, fl4->flowi4_oif))) {
@@ -2387,43 +2421,41 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
if (fl4->saddr == 0)
fl4->saddr = inet_select_addr(dev_out, 0,
RT_SCOPE_LINK);
- res.type = RTN_UNICAST;
+ res->type = RTN_UNICAST;
goto make_route;
}
rth = ERR_PTR(err);
goto out;
}
- if (res.type == RTN_LOCAL) {
+ if (res->type == RTN_LOCAL) {
if (!fl4->saddr) {
- if (res.fi->fib_prefsrc)
- fl4->saddr = res.fi->fib_prefsrc;
+ if (res->fi->fib_prefsrc)
+ fl4->saddr = res->fi->fib_prefsrc;
else
fl4->saddr = fl4->daddr;
}
/* L3 master device is the loopback for that domain */
- dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? :
+ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
net->loopback_dev;
fl4->flowi4_oif = dev_out->ifindex;
flags |= RTCF_LOCAL;
goto make_route;
}
- fib_select_path(net, &res, fl4, skb);
+ fib_select_path(net, res, fl4, skb);
- dev_out = FIB_RES_DEV(res);
+ dev_out = FIB_RES_DEV(*res);
fl4->flowi4_oif = dev_out->ifindex;
make_route:
- rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
+ rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
out:
- rcu_read_unlock();
return rth;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
@@ -2477,7 +2509,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->input = dst_discard;
new->output = dst_discard_out;
- new->dev = ort->dst.dev;
+ new->dev = net->loopback_dev;
if (new->dev)
dev_hold(new->dev);
@@ -2492,7 +2524,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_uses_gateway = ort->rt_uses_gateway;
INIT_LIST_HEAD(&rt->rt_uncached);
- dst_free(new);
}
dst_release(dst_orig);
@@ -2517,9 +2548,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
+/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq, int event)
+ u32 seq)
{
struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
@@ -2528,7 +2560,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0);
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
if (!nlh)
return -EMSGSIZE;
@@ -2636,6 +2668,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct net *net = sock_net(in_skb->sk);
struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
+ struct fib_result res = {};
struct rtable *rt = NULL;
struct flowi4 fl4;
__be32 dst = 0;
@@ -2692,10 +2725,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
+ rcu_read_lock();
+
if (iif) {
struct net_device *dev;
- dev = __dev_get_by_index(net, iif);
+ dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
err = -ENODEV;
goto errout_free;
@@ -2704,14 +2739,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
+ dev, &res);
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
err = -rt->dst.error;
} else {
- rt = ip_route_output_key(net, &fl4);
-
+ rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
err = 0;
if (IS_ERR(rt))
err = PTR_ERR(rt);
@@ -2727,17 +2762,25 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = rt->rt_table_id;
- err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
- RTM_NEWROUTE);
+ if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+ err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
+ rt->rt_type, res.prefix, res.prefixlen,
+ fl4.flowi4_tos, res.fi, 0);
+ else
+ err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+ NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
if (err < 0)
goto errout_free;
+ rcu_read_unlock();
+
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
return err;
errout_free:
+ rcu_read_unlock();
kfree_skb(skb);
goto errout;
}
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 6426250a58ea..7835bb4a1fab 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -232,7 +232,8 @@ EXPORT_SYMBOL(tcp_get_cookie_sock);
* return false if we decode a tcp option that is disabled
* on the host.
*/
-bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
+bool cookie_timestamp_decode(const struct net *net,
+ struct tcp_options_received *tcp_opt)
{
/* echoed timestamp, lowest bits contain options */
u32 options = tcp_opt->rcv_tsecr;
@@ -242,12 +243,12 @@ bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
return true;
}
- if (!sysctl_tcp_timestamps)
+ if (!net->ipv4.sysctl_tcp_timestamps)
return false;
tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
- if (tcp_opt->sack_ok && !sysctl_tcp_sack)
+ if (tcp_opt->sack_ok && !net->ipv4.sysctl_tcp_sack)
return false;
if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
@@ -256,7 +257,7 @@ bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
tcp_opt->wscale_ok = 1;
tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
- return sysctl_tcp_window_scaling != 0;
+ return net->ipv4.sysctl_tcp_window_scaling != 0;
}
EXPORT_SYMBOL(cookie_timestamp_decode);
@@ -312,14 +313,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, 0, NULL);
+ tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
- tsoff = secure_tcp_ts_off(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
+ tsoff = secure_tcp_ts_off(sock_net(sk),
+ ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr);
tcp_opt.rcv_tsecr -= tsoff;
}
- if (!cookie_timestamp_decode(&tcp_opt))
+ if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt))
goto out;
ret = NULL;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 86957e9cd6c6..9bf809726066 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -360,32 +360,30 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
tcp_fastopen_active_timeout_reset();
+
+ return ret;
+}
+
+static int proc_tcp_available_ulp(struct ctl_table *ctl,
+ int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+ tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+
return ret;
}
static struct ctl_table ipv4_table[] = {
{
- .procname = "tcp_timestamps",
- .data = &sysctl_tcp_timestamps,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_window_scaling",
- .data = &sysctl_tcp_window_scaling,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_sack",
- .data = &sysctl_tcp_sack,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_retrans_collapse",
.data = &sysctl_tcp_retrans_collapse,
.maxlen = sizeof(int),
@@ -707,6 +705,12 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_ms_jiffies,
},
{
+ .procname = "tcp_available_ulp",
+ .maxlen = TCP_ULP_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_ulp,
+ },
+ {
.procname = "icmp_msgs_per_sec",
.data = &sysctl_icmp_msgs_per_sec,
.maxlen = sizeof(int),
@@ -1116,6 +1120,27 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "tcp_sack",
+ .data = &init_net.ipv4.sysctl_tcp_sack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_window_scaling",
+ .data = &init_net.ipv4.sysctl_tcp_window_scaling,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_timestamps",
+ .data = &init_net.ipv4.sysctl_tcp_timestamps,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index aaf663a1e571..4c88d20d91d4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -320,17 +320,36 @@ struct tcp_splice_state {
* All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
-int tcp_memory_pressure __read_mostly;
-EXPORT_SYMBOL(tcp_memory_pressure);
+unsigned long tcp_memory_pressure __read_mostly;
+EXPORT_SYMBOL_GPL(tcp_memory_pressure);
void tcp_enter_memory_pressure(struct sock *sk)
{
- if (!tcp_memory_pressure) {
+ unsigned long val;
+
+ if (tcp_memory_pressure)
+ return;
+ val = jiffies;
+
+ if (!val)
+ val--;
+ if (!cmpxchg(&tcp_memory_pressure, 0, val))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
- tcp_memory_pressure = 1;
- }
}
-EXPORT_SYMBOL(tcp_enter_memory_pressure);
+EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
+
+void tcp_leave_memory_pressure(struct sock *sk)
+{
+ unsigned long val;
+
+ if (!tcp_memory_pressure)
+ return;
+ val = xchg(&tcp_memory_pressure, 0);
+ if (val)
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
+ jiffies_to_msecs(jiffies - val));
+}
+EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
@@ -882,8 +901,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
-static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now, size_goal;
@@ -1013,6 +1032,7 @@ out_err:
}
return sk_stream_error(sk, flags, err);
}
+EXPORT_SYMBOL_GPL(do_tcp_sendpages);
int tcp_sendpage(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
@@ -1084,9 +1104,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
+ struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
+ if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
+ uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
if (tp->fastopen_req)
return -EALREADY; /* Another Fast Open is in progress */
@@ -1108,7 +1131,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
}
}
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
- err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
+ err = __inet_stream_connect(sk->sk_socket, uaddr,
msg->msg_namelen, flags, 1);
/* fastopen_req could already be freed in __inet_stream_connect
* if the connection times out or gets rst
@@ -2378,9 +2401,10 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l
return 0;
}
-static int tcp_repair_options_est(struct tcp_sock *tp,
+static int tcp_repair_options_est(struct sock *sk,
struct tcp_repair_opt __user *optbuf, unsigned int len)
{
+ struct tcp_sock *tp = tcp_sk(sk);
struct tcp_repair_opt opt;
while (len >= sizeof(opt)) {
@@ -2393,6 +2417,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp,
switch (opt.opt_code) {
case TCPOPT_MSS:
tp->rx_opt.mss_clamp = opt.opt_val;
+ tcp_mtup_init(sk);
break;
case TCPOPT_WINDOW:
{
@@ -2458,6 +2483,24 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
+ case TCP_ULP: {
+ char name[TCP_ULP_NAME_MAX];
+
+ if (optlen < 1)
+ return -EINVAL;
+
+ val = strncpy_from_user(name, optval,
+ min_t(long, TCP_ULP_NAME_MAX - 1,
+ optlen));
+ if (val < 0)
+ return -EFAULT;
+ name[val] = 0;
+
+ lock_sock(sk);
+ err = tcp_set_ulp(sk, name);
+ release_sock(sk);
+ return err;
+ }
default:
/* fallthru */
break;
@@ -2553,7 +2596,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (!tp->repair)
err = -EINVAL;
else if (sk->sk_state == TCP_ESTABLISHED)
- err = tcp_repair_options_est(tp,
+ err = tcp_repair_options_est(sk,
(struct tcp_repair_opt __user *)optval,
optlen);
else
@@ -2671,8 +2714,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
+ case TCP_MD5SIG_EXT:
/* Read the IP->Key mappings from userspace */
- err = tp->af_specific->md5_parse(sk, optval, optlen);
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
case TCP_USER_TIMEOUT:
@@ -3014,6 +3058,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return -EFAULT;
return 0;
+ case TCP_ULP:
+ if (get_user(len, optlen))
+ return -EFAULT;
+ len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
+ if (!icsk->icsk_ulp_ops) {
+ if (put_user(0, optlen))
+ return -EFAULT;
+ return 0;
+ }
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
+ return -EFAULT;
+ return 0;
+
case TCP_THIN_LINEAR_TIMEOUTS:
val = tp->thin_lto;
break;
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6e3c512054a6..324c9bcc5456 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -180,6 +180,7 @@ void tcp_init_congestion_control(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_sk(sk)->prior_ssthresh = 0;
if (icsk->icsk_ca_ops->init)
icsk->icsk_ca_ops->init(sk);
if (tcp_ca_needs_ecn(sk))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2fa55f57ac06..2ab7e2fa9bb9 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -76,9 +76,6 @@
#include <asm/unaligned.h>
#include <linux/errqueue.h>
-int sysctl_tcp_timestamps __read_mostly = 1;
-int sysctl_tcp_window_scaling __read_mostly = 1;
-int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly;
int sysctl_tcp_max_reordering __read_mostly = 300;
int sysctl_tcp_dsack __read_mostly = 1;
@@ -112,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
+#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -2920,9 +2918,9 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
rtt_us ? : jiffies_to_usecs(1));
}
-static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
- long seq_rtt_us, long sack_rtt_us,
- long ca_rtt_us)
+static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
+ long seq_rtt_us, long sack_rtt_us,
+ long ca_rtt_us, struct rate_sample *rs)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2947,6 +2945,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
seq_rtt_us = ca_rtt_us = delta_us;
}
+ rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
return false;
@@ -2966,12 +2965,13 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
{
+ struct rate_sample rs;
long rtt_us = -1L;
if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack);
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs);
}
@@ -3176,9 +3176,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt);
}
- sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
- ca_rtt_us);
+ ca_rtt_us, sack->rate);
if (flag & FLAG_ACKED) {
tcp_rearm_rto(sk);
@@ -3214,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (icsk->icsk_ca_ops->pkts_acked) {
struct ack_sample sample = { .pkts_acked = pkts_acked,
- .rtt_us = ca_rtt_us,
+ .rtt_us = sack->rate->rtt_us,
.in_flight = last_in_flight };
icsk->icsk_ca_ops->pkts_acked(sk, &sample);
@@ -3568,7 +3567,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (before(ack, prior_snd_una)) {
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - tp->max_window)) {
- tcp_send_challenge_ack(sk, skb);
+ if (!(flag & FLAG_NO_CHALLENGE_ACK))
+ tcp_send_challenge_ack(sk, skb);
return -1;
}
goto old_ack;
@@ -3721,7 +3721,8 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
* But, this can also be called on packets in the established flow when
* the fast version below fails.
*/
-void tcp_parse_options(const struct sk_buff *skb,
+void tcp_parse_options(const struct net *net,
+ const struct sk_buff *skb,
struct tcp_options_received *opt_rx, int estab,
struct tcp_fastopen_cookie *foc)
{
@@ -3762,7 +3763,7 @@ void tcp_parse_options(const struct sk_buff *skb,
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && th->syn &&
- !estab && sysctl_tcp_window_scaling) {
+ !estab && net->ipv4.sysctl_tcp_window_scaling) {
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > TCP_MAX_WSCALE) {
@@ -3778,7 +3779,7 @@ void tcp_parse_options(const struct sk_buff *skb,
case TCPOPT_TIMESTAMP:
if ((opsize == TCPOLEN_TIMESTAMP) &&
((estab && opt_rx->tstamp_ok) ||
- (!estab && sysctl_tcp_timestamps))) {
+ (!estab && net->ipv4.sysctl_tcp_timestamps))) {
opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = get_unaligned_be32(ptr);
opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
@@ -3786,7 +3787,7 @@ void tcp_parse_options(const struct sk_buff *skb,
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
- !estab && sysctl_tcp_sack) {
+ !estab && net->ipv4.sysctl_tcp_sack) {
opt_rx->sack_ok = TCP_SACK_SEEN;
tcp_sack_reset(opt_rx);
}
@@ -3855,7 +3856,8 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr
/* Fast parse options. This hopes to only see timestamps.
* If it is wrong it falls back on tcp_parse_options().
*/
-static bool tcp_fast_parse_options(const struct sk_buff *skb,
+static bool tcp_fast_parse_options(const struct net *net,
+ const struct sk_buff *skb,
const struct tcphdr *th, struct tcp_sock *tp)
{
/* In the spirit of fast parsing, compare doff directly to constant
@@ -3870,7 +3872,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
return true;
}
- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
+ tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@@ -5231,7 +5233,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
bool rst_seq_match = false;
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+ if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
+ tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5602,7 +5605,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
tcp_clear_options(&opt);
opt.user_mss = opt.mss_clamp = 0;
- tcp_parse_options(synack, &opt, 0, NULL);
+ tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL);
mss = opt.mss_clamp;
}
@@ -5656,7 +5659,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
int saved_clamp = tp->rx_opt.mss_clamp;
bool fastopen_fail;
- tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
+ tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
tp->rx_opt.rcv_tsecr -= tp->tsoffset;
@@ -5951,13 +5954,17 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
- FLAG_UPDATE_TS_RECENT) > 0;
+ FLAG_UPDATE_TS_RECENT |
+ FLAG_NO_CHALLENGE_ACK) > 0;
+ if (!acceptable) {
+ if (sk->sk_state == TCP_SYN_RECV)
+ return 1; /* send one RST */
+ tcp_send_challenge_ack(sk, skb);
+ goto discard;
+ }
switch (sk->sk_state) {
case TCP_SYN_RECV:
- if (!acceptable)
- return 1;
-
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
@@ -6026,14 +6033,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* our SYNACK so stop the SYNACK timer.
*/
if (req) {
- /* Return RST if ack_seq is invalid.
- * Note that RFC793 only says to generate a
- * DUPACK for it but for TCP Fast Open it seems
- * better to treat this case like TCP_SYN_RECV
- * above.
- */
- if (!acceptable)
- return 1;
/* We no longer need the request sock. */
reqsk_fastopen_remove(sk, req, false);
tcp_rearm_rto(sk);
@@ -6333,7 +6332,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
tmp_opt.user_mss = tp->rx_opt.user_mss;
- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+ tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
+ want_cookie ? NULL : &foc);
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
@@ -6351,7 +6351,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_free;
if (tmp_opt.tstamp_ok)
- tcp_rsk(req)->ts_off = af_ops->init_ts_off(skb);
+ tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
if (!want_cookie && !isn) {
/* Kill the following clause, if you dislike this way. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 191b2f78b19d..d774bcd9a54b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -80,6 +80,7 @@
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
@@ -102,10 +103,9 @@ static u32 tcp_v4_init_seq(const struct sk_buff *skb)
tcp_hdr(skb)->source);
}
-static u32 tcp_v4_init_ts_off(const struct sk_buff *skb)
+static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
- return secure_tcp_ts_off(ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr);
+ return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -242,7 +242,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr,
inet->inet_sport,
usin->sin_port);
- tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr,
+ tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
+ inet->inet_saddr,
inet->inet_daddr);
}
@@ -906,6 +907,48 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
+ const struct tcp_md5sig_info *md5sig;
+ __be32 mask;
+ struct tcp_md5sig_key *best_match = NULL;
+ bool match;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ lockdep_sock_is_held(sk));
+ if (!md5sig)
+ return NULL;
+
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ if (key->family != family)
+ continue;
+
+ if (family == AF_INET) {
+ mask = inet_make_mask(key->prefixlen);
+ match = (key->addr.a4.s_addr & mask) ==
+ (addr->a4.s_addr & mask);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
+ key->prefixlen);
+#endif
+ } else {
+ match = false;
+ }
+
+ if (match && (!best_match ||
+ key->prefixlen > best_match->prefixlen))
+ best_match = key;
+ }
+ return best_match;
+}
+EXPORT_SYMBOL(tcp_md5_do_lookup);
+
+struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family, u8 prefixlen)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
unsigned int size = sizeof(struct in_addr);
const struct tcp_md5sig_info *md5sig;
@@ -921,12 +964,12 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
- if (!memcmp(&key->addr, addr, size))
+ if (!memcmp(&key->addr, addr, size) &&
+ key->prefixlen == prefixlen)
return key;
}
return NULL;
}
-EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
@@ -940,14 +983,15 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
+ int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
+ gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup(sk, addr, family);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
if (key) {
/* Pre-existing entry - just update that one. */
memcpy(key->key, newkey, newkeylen);
@@ -978,6 +1022,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
memcpy(key->key, newkey, newkeylen);
key->keylen = newkeylen;
key->family = family;
+ key->prefixlen = prefixlen;
memcpy(&key->addr, addr,
(family == AF_INET6) ? sizeof(struct in6_addr) :
sizeof(struct in_addr));
@@ -986,11 +1031,12 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
EXPORT_SYMBOL(tcp_md5_do_add);
-int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
+ u8 prefixlen)
{
struct tcp_md5sig_key *key;
- key = tcp_md5_do_lookup(sk, addr, family);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1016,11 +1062,12 @@ static void tcp_clear_md5_list(struct sock *sk)
}
}
-static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
- int optlen)
+static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
+ char __user *optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+ u8 prefixlen = 32;
if (optlen < sizeof(cmd))
return -EINVAL;
@@ -1031,15 +1078,22 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
if (sin->sin_family != AF_INET)
return -EINVAL;
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
+ prefixlen = cmd.tcpm_prefixlen;
+ if (prefixlen > 32)
+ return -EINVAL;
+ }
+
if (!cmd.tcpm_keylen)
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET);
+ AF_INET, prefixlen);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
+ AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
GFP_KERNEL);
}
@@ -1342,7 +1396,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
* across. Shucks.
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET, key->key, key->keylen, GFP_ATOMIC);
+ AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
@@ -1675,6 +1729,8 @@ process:
}
if (nsk == sk) {
reqsk_put(req);
+ } else if (tcp_filter(sk, skb)) {
+ goto discard_and_relse;
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
goto discard_and_relse;
@@ -1860,6 +1916,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_cleanup_congestion_control(sk);
+ tcp_cleanup_ulp(sk);
+
/* Cleanup up the write buffer. */
tcp_write_queue_purge(sk);
@@ -2387,6 +2445,7 @@ struct proto tcp_prot = {
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
@@ -2465,6 +2524,9 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+ net->ipv4.sysctl_tcp_sack = 1;
+ net->ipv4.sysctl_tcp_window_scaling = 1;
+ net->ipv4.sysctl_tcp_timestamps = 1;
return 0;
fail:
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d0642df73044..d30ee31e94eb 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, 0, NULL);
+ tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
if (tmp_opt.rcv_tsecr)
@@ -559,7 +559,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, 0, NULL);
+ tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 478f75baee31..9a9c395b6235 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -569,18 +569,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
- if (likely(sysctl_tcp_timestamps && !*md5)) {
+ if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
- if (likely(sysctl_tcp_window_scaling)) {
+ if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
remaining -= TCPOLEN_WSCALE_ALIGNED;
}
- if (likely(sysctl_tcp_sack)) {
+ if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
@@ -1328,9 +1328,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
return 0;
}
-/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
- * eventually). The difference is that pulled data not copied, but
- * immediately discarded.
+/* This is similar to __pskb_pull_tail(). The difference is that pulled
+ * data is not copied, but immediately discarded.
*/
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
@@ -1365,7 +1364,6 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
}
shinfo->nr_frags = k;
- skb_reset_tail_pointer(skb);
skb->data_len -= len;
skb->len = skb->data_len;
return len;
@@ -3273,8 +3271,9 @@ static void tcp_connect_init(struct sock *sk)
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
- tp->tcp_header_len = sizeof(struct tcphdr) +
- (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+ tp->tcp_header_len = sizeof(struct tcphdr);
+ if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
+ tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
#ifdef CONFIG_TCP_MD5SIG
if (tp->af_specific->md5_lookup(sk, sk))
@@ -3305,7 +3304,7 @@ static void tcp_connect_init(struct sock *sk)
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
- sysctl_tcp_window_scaling,
+ sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index ad99569d4c1e..3330a370d306 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -185,3 +185,4 @@ void tcp_rate_check_app_limited(struct sock *sk)
tp->app_limited =
(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
}
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c4a35ba7f8ed..c0feeeef962a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -139,21 +139,17 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
* @timeout: A custom timeout value.
* If set to 0 the default timeout is calculated and used.
* Using TCP_RTO_MIN and the number of unsuccessful retransmits.
- * @syn_set: true if the SYN Bit was set.
*
* The default "timeout" value this function can calculate and use
* is equivalent to the timeout of a TCP Connection
* after "boundary" unsuccessful, exponentially backed-off
- * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
- * syn_set flag is set.
- *
+ * retransmissions with an initial RTO of TCP_RTO_MIN.
*/
static bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
- unsigned int timeout,
- bool syn_set)
+ unsigned int timeout)
{
- unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
+ const unsigned int rto_base = TCP_RTO_MIN;
unsigned int linear_backoff_thresh, start_ts;
if (!inet_csk(sk)->icsk_retransmits)
@@ -181,8 +177,8 @@ static int tcp_write_timeout(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
+ bool expired, do_reset;
int retry_until;
- bool do_reset, syn_set = false;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits) {
@@ -196,9 +192,9 @@ static int tcp_write_timeout(struct sock *sk)
sk_rethink_txhash(sk);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
- syn_set = true;
+ expired = icsk->icsk_retransmits >= retry_until;
} else {
- if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {
+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
/* Some middle-boxes may black-hole Fast Open _after_
* the handshake. Therefore we conservatively disable
* Fast Open on this path on recurring timeouts after
@@ -224,15 +220,15 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
- !retransmits_timed_out(sk, retry_until, 0, 0);
+ !retransmits_timed_out(sk, retry_until, 0);
if (tcp_out_of_resources(sk, do_reset))
return 1;
}
+ expired = retransmits_timed_out(sk, retry_until,
+ icsk->icsk_user_timeout);
}
-
- if (retransmits_timed_out(sk, retry_until,
- syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) {
+ if (expired) {
/* Has it gone just too far? */
tcp_write_err(sk);
return 1;
@@ -540,7 +536,7 @@ out_reset_timer:
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
- if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0))
+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
__sk_dst_reset(sk);
out:;
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
new file mode 100644
index 000000000000..2417f55374c5
--- /dev/null
+++ b/net/ipv4/tcp_ulp.c
@@ -0,0 +1,135 @@
+/*
+ * Pluggable TCP upper layer protocol support.
+ *
+ * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
+ *
+ */
+
+#include<linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_ulp_list_lock);
+static LIST_HEAD(tcp_ulp_list);
+
+/* Simple linear search, don't expect many entries! */
+static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
+{
+ struct tcp_ulp_ops *e;
+
+ list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
+ if (strcmp(e->name, name) == 0)
+ return e;
+ }
+
+ return NULL;
+}
+
+static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
+{
+ const struct tcp_ulp_ops *ulp = NULL;
+
+ rcu_read_lock();
+ ulp = tcp_ulp_find(name);
+
+#ifdef CONFIG_MODULES
+ if (!ulp && capable(CAP_NET_ADMIN)) {
+ rcu_read_unlock();
+ request_module("%s", name);
+ rcu_read_lock();
+ ulp = tcp_ulp_find(name);
+ }
+#endif
+ if (!ulp || !try_module_get(ulp->owner))
+ ulp = NULL;
+
+ rcu_read_unlock();
+ return ulp;
+}
+
+/* Attach new upper layer protocol to the list
+ * of available protocols.
+ */
+int tcp_register_ulp(struct tcp_ulp_ops *ulp)
+{
+ int ret = 0;
+
+ spin_lock(&tcp_ulp_list_lock);
+ if (tcp_ulp_find(ulp->name)) {
+ pr_notice("%s already registered or non-unique name\n",
+ ulp->name);
+ ret = -EEXIST;
+ } else {
+ list_add_tail_rcu(&ulp->list, &tcp_ulp_list);
+ }
+ spin_unlock(&tcp_ulp_list_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_ulp);
+
+void tcp_unregister_ulp(struct tcp_ulp_ops *ulp)
+{
+ spin_lock(&tcp_ulp_list_lock);
+ list_del_rcu(&ulp->list);
+ spin_unlock(&tcp_ulp_list_lock);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_ulp);
+
+/* Build string with list of available upper layer protocl values */
+void tcp_get_available_ulp(char *buf, size_t maxlen)
+{
+ struct tcp_ulp_ops *ulp_ops;
+ size_t offs = 0;
+
+ *buf = '\0';
+ rcu_read_lock();
+ list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) {
+ offs += snprintf(buf + offs, maxlen - offs,
+ "%s%s",
+ offs == 0 ? "" : " ", ulp_ops->name);
+ }
+ rcu_read_unlock();
+}
+
+void tcp_cleanup_ulp(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (!icsk->icsk_ulp_ops)
+ return;
+
+ if (icsk->icsk_ulp_ops->release)
+ icsk->icsk_ulp_ops->release(sk);
+ module_put(icsk->icsk_ulp_ops->owner);
+}
+
+/* Change upper layer protocol for socket */
+int tcp_set_ulp(struct sock *sk, const char *name)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_ulp_ops *ulp_ops;
+ int err = 0;
+
+ if (icsk->icsk_ulp_ops)
+ return -EEXIST;
+
+ ulp_ops = __tcp_ulp_find_autoload(name);
+ if (!ulp_ops)
+ err = -ENOENT;
+ else
+ err = ulp_ops->init(sk);
+
+ if (err)
+ goto out;
+
+ icsk->icsk_ulp_ops = ulp_ops;
+ out:
+ return err;
+}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fdcb7437cc15..86fad2a14ac4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1163,6 +1163,35 @@ out:
return ret;
}
+#if BITS_PER_LONG == 64
+static void udp_set_dev_scratch(struct sk_buff *skb)
+{
+ struct udp_dev_scratch *scratch;
+
+ BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
+ scratch = (struct udp_dev_scratch *)&skb->dev_scratch;
+ scratch->truesize = skb->truesize;
+ scratch->len = skb->len;
+ scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
+ scratch->is_linear = !skb_is_nonlinear(skb);
+}
+
+static int udp_skb_truesize(struct sk_buff *skb)
+{
+ return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize;
+}
+#else
+static void udp_set_dev_scratch(struct sk_buff *skb)
+{
+ skb->dev_scratch = skb->truesize;
+}
+
+static int udp_skb_truesize(struct sk_buff *skb)
+{
+ return skb->dev_scratch;
+}
+#endif
+
/* fully reclaim rmem/fwd memory allocated for skb */
static void udp_rmem_release(struct sock *sk, int size, int partial,
bool rx_queue_lock_held)
@@ -1213,14 +1242,16 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
*/
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
- udp_rmem_release(sk, skb->dev_scratch, 1, false);
+ prefetch(&skb->data);
+ udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
}
EXPORT_SYMBOL(udp_skb_destructor);
/* as above, but the caller held the rx queue lock, too */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
{
- udp_rmem_release(sk, skb->dev_scratch, 1, true);
+ prefetch(&skb->data);
+ udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
}
/* Idea of busylocks is to let producers grab an extra spinlock
@@ -1274,10 +1305,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
busy = busylock_acquire(sk);
}
size = skb->truesize;
- /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss
- * in udp_skb_destructor()
- */
- skb->dev_scratch = size;
+ udp_set_dev_scratch(skb);
/* we drop only if the receive buf is full and the receive
* queue contains some other skb
@@ -1359,7 +1387,8 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
sk_peek_offset_bwd(sk, len);
unlock_sock_fast(sk, slow);
}
- consume_skb(skb);
+
+ consume_stateless_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_consume_udp);
@@ -1369,16 +1398,23 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
{
struct sk_buff *skb;
- while ((skb = skb_peek(rcvq)) != NULL &&
- udp_lib_checksum_complete(skb)) {
- __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
- IS_UDPLITE(sk));
- __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
- IS_UDPLITE(sk));
- atomic_inc(&sk->sk_drops);
- __skb_unlink(skb, rcvq);
- *total += skb->truesize;
- kfree_skb(skb);
+ while ((skb = skb_peek(rcvq)) != NULL) {
+ if (udp_lib_checksum_complete(skb)) {
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ atomic_inc(&sk->sk_drops);
+ __skb_unlink(skb, rcvq);
+ *total += skb->truesize;
+ kfree_skb(skb);
+ } else {
+ /* the csum related bits could be changed, refresh
+ * the scratch area
+ */
+ udp_set_dev_scratch(skb);
+ break;
+ }
}
return skb;
}
@@ -1540,7 +1576,7 @@ try_again:
if (!skb)
return err;
- ulen = skb->len;
+ ulen = udp_skb_len(skb);
copied = len;
if (copied > ulen - off)
copied = ulen - off;
@@ -1555,14 +1591,18 @@ try_again:
if (copied < ulen || peeking ||
(is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
- checksum_valid = !udp_lib_checksum_complete(skb);
+ checksum_valid = udp_skb_csum_unnecessary(skb) ||
+ !__udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
}
- if (checksum_valid || skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, off, msg, copied);
- else {
+ if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
+ if (udp_skb_is_linear(skb))
+ err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
+ else
+ err = skb_copy_datagram_msg(skb, off, msg, copied);
+ } else {
err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
@@ -1739,6 +1779,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id_once(sk, skb);
}
+ /* clear all pending head states while they are hot in the cache */
+ skb_release_head_state(skb);
+
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
@@ -1853,6 +1896,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
}
+ prefetch(&sk->sk_rmem_alloc);
if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb))
goto csum_error;
@@ -1881,9 +1925,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
struct dst_entry *old;
- dst_hold(dst);
- old = xchg(&sk->sk_rx_dst, dst);
- dst_release(old);
+ if (dst_hold_safe(dst)) {
+ old = xchg(&sk->sk_rx_dst, dst);
+ dst_release(old);
+ }
}
/*
@@ -2207,13 +2252,11 @@ void udp_v4_early_demux(struct sk_buff *skb)
if (dst)
dst = dst_check(dst, 0);
if (dst) {
- /* DST_NOCACHE can not be used without taking a reference */
- if (dst->flags & DST_NOCACHE) {
- if (likely(atomic_inc_not_zero(&dst->__refcnt)))
- skb_dst_set(skb, dst);
- } else {
- skb_dst_set_noref(skb, dst);
- }
+ /* set noref for now.
+ * any place which wants to hold dst has to call
+ * dst_hold_safe()
+ */
+ skb_dst_set_noref(skb, dst);
}
}
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
index 71acd0014f2d..856d2dfdb44b 100644
--- a/net/ipv4/xfrm4_mode_beet.c
+++ b/net/ipv4/xfrm4_mode_beet.c
@@ -57,8 +57,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
xfrm4_beet_make_header(skb);
- ph = (struct ip_beet_phdr *)
- __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
+ ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
top_iph = ip_hdr(skb);