diff options
Diffstat (limited to 'net/ipv4')
37 files changed, 1082 insertions, 365 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f83de23a30e7..afcb435adfbe 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_rate.o tcp_recovery.o \ + tcp_rate.o tcp_recovery.o tcp_ulp.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f3dad1661343..58925b6597de 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1043,7 +1043,7 @@ static struct inet_protosw inetsw_array[] = .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, - .ops = &inet_dgram_ops, + .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 22377c8ff14b..37db44f60718 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -1,5 +1,6 @@ #define pr_fmt(fmt) "IPsec: " fmt +#include <crypto/algapi.h> #include <crypto/hash.h> #include <linux/err.h> #include <linux/module.h> @@ -220,7 +221,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ @@ -277,7 +280,7 @@ static void ah_input_done(struct crypto_async_request *base, int err) auth_data = ah_tmp_auth(work_iph, ihl); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); - err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; @@ -393,7 +396,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, ihl); sg_init_table(sg, nfrags + sglists); - skb_to_sgvec_nomark(skb, sg, 0, skb->len); + err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ @@ -413,7 +418,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) goto out_free; } - err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out_free; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index ae96e6f3e0cb..8b52179ddc6e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -539,7 +539,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, skb_reserve(skb, hlen); skb_reset_network_header(skb); - arp = (struct arphdr *) skb_put(skb, arp_hdr_len(dev)); + arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) @@ -863,8 +863,8 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + addr_type = -1; if (n || IN_DEV_ARP_ACCEPT(in_dev)) { - addr_type = -1; is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } @@ -1113,13 +1113,17 @@ static int arp_invalidate(struct net_device *dev, __be32 ip) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; + struct neigh_table *tbl = &arp_tbl; if (neigh) { if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); + write_lock_bh(&tbl->lock); neigh_release(neigh); + neigh_remove_one(neigh, tbl); + write_unlock_bh(&tbl->lock); } return err; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index df14815a3b8c..a7dd088d5fc9 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -176,6 +176,7 @@ EXPORT_SYMBOL(__ip_dev_find); static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); +static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); #ifdef CONFIG_SYSCTL @@ -441,6 +442,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1, **ifap, **last_primary; + struct in_validator_info ivi; + int ret; ASSERT_RTNL(); @@ -471,6 +474,23 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, } } + /* Allow any devices that wish to register ifaddr validtors to weigh + * in now, before changes are committed. The rntl lock is serializing + * access here, so the state should not change between a validator call + * and a final notify on commit. This isn't invoked on promotion under + * the assumption that validators are checking the address itself, and + * not the flags. + */ + ivi.ivi_addr = ifa->ifa_address; + ivi.ivi_dev = ifa->ifa_dev; + ret = blocking_notifier_call_chain(&inetaddr_validator_chain, + NETDEV_UP, &ivi); + ret = notifier_to_errno(ret); + if (ret) { + inet_free_ifa(ifa); + return ret; + } + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { prandom_seed((__force u32) ifa->ifa_local); ifap = last_primary; @@ -1356,6 +1376,19 @@ int unregister_inetaddr_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_inetaddr_notifier); +int register_inetaddr_validator_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&inetaddr_validator_chain, nb); +} +EXPORT_SYMBOL(register_inetaddr_validator_notifier); + +int unregister_inetaddr_validator_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&inetaddr_validator_chain, + nb); +} +EXPORT_SYMBOL(unregister_inetaddr_validator_notifier); + /* Rename ifa_labels for a device name change. Make some effort to preserve * existing alias numbering and to create unique labels if possible. */ diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 65cc02bd82bc..1f18b4650253 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -248,6 +248,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; @@ -313,11 +314,13 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); - esp->esph = ip_esp_hdr(skb); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); @@ -374,9 +377,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * esp->esph = esph; sg_init_table(sg, esp->nfrags); - skb_to_sgvec(skb, sg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, sg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; if (!esp->inplace) { int allocsize; @@ -400,9 +405,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * spin_unlock_bh(&x->lock); sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); - skb_to_sgvec(skb, dsg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + esp->clen + alen); + err = skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + if (unlikely(err < 0)) + goto error; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -602,7 +609,7 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) * decryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { - esph = (void *)skb_push(skb, 4); + esph = skb_push(skb, 4); *seqhi = esph->spi; esph->spi = esph->seq_no; esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi; @@ -687,7 +694,9 @@ skip_cow: esp_input_set_header(skb, seqhi); sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, 0, skb->len); + err = skb_to_sgvec(skb, sg, 0, skb->len); + if (unlikely(err < 0)) + goto out; skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 14d2f7bd7c76..4e678fa892dd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -588,7 +588,8 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) - err = fib_table_delete(net, tb, &cfg); + err = fib_table_delete(net, tb, &cfg, + NULL); else err = -ESRCH; } else { @@ -684,7 +685,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), - nla_len(attr)); + nla_len(attr), + extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -701,7 +703,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, break; case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, + extack); if (err < 0) goto errout; break; @@ -732,7 +735,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_table_delete(net, tb, &cfg); + err = fib_table_delete(net, tb, &cfg, extack); errout: return err; } @@ -851,7 +854,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad if (cmd == RTM_NEWROUTE) fib_table_insert(net, tb, &cfg, NULL); else - fib_table_delete(net, tb, &cfg); + fib_table_delete(net, tb, &cfg, NULL); } void fib_add_ifaddr(struct in_ifaddr *ifa) diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 2704e08545da..769ab87ebc4b 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -30,7 +30,8 @@ static inline void fib_alias_accessed(struct fib_alias *fa) void fib_release_info(struct fib_info *); struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack); -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, + struct netlink_ext_ack *extack); int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4852e183afe0..ff47ea1408fe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -152,7 +152,8 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp) * free_fib_info_rcu() */ - dst_free(&rt->dst); + dst_dev_put(&rt->dst); + dst_release_immediate(&rt->dst); } static void free_nh_exceptions(struct fib_nh *nh) @@ -194,8 +195,10 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) struct rtable *rt; rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); - if (rt) - dst_free(&rt->dst); + if (rt) { + dst_dev_put(&rt->dst); + dst_release_immediate(&rt->dst); + } } free_percpu(rtp); } @@ -204,6 +207,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -214,8 +218,9 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); + m = fi->fib_metrics; + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + kfree(m); kfree(fi); } @@ -530,7 +535,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, ret = lwtunnel_build_state(nla_get_u16( nla_entype), nla, AF_INET, cfg, - &lwtstate); + &lwtstate, extack); if (ret) goto errout; nexthop_nh->nh_lwtstate = @@ -612,7 +617,8 @@ static inline void fib_add_weight(struct fib_info *fi, static int fib_encap_match(u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, - const struct fib_config *cfg) + const struct fib_config *cfg, + struct netlink_ext_ack *extack) { struct lwtunnel_state *lwtstate; int ret, result = 0; @@ -620,8 +626,8 @@ static int fib_encap_match(u16 encap_type, if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; - ret = lwtunnel_build_state(encap_type, encap, - AF_INET, cfg, &lwtstate); + ret = lwtunnel_build_state(encap_type, encap, AF_INET, + cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); lwtstate_free(lwtstate); @@ -630,7 +636,8 @@ static int fib_encap_match(u16 encap_type, return result; } -int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) +int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, + struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; @@ -642,9 +649,9 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) if (cfg->fc_oif || cfg->fc_gw) { if (cfg->fc_encap) { - if (fib_encap_match(cfg->fc_encap_type, - cfg->fc_encap, fi->fib_nh, cfg)) - return 1; + if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap, + fi->fib_nh, cfg, extack)) + return 1; } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) @@ -1010,11 +1017,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; - fi->fib_metrics[type - 1] = val; + fi->fib_metrics->metrics[type - 1] = val; } if (ecn_ca) - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } @@ -1078,11 +1085,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg, goto failure; fib_info_cnt++; if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); if (!fi->fib_metrics) goto failure; + atomic_set(&fi->fib_metrics->refcnt, 1); } else - fi->fib_metrics = (u32 *) dst_default_metrics; + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1145,7 +1153,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } err = lwtunnel_build_state(cfg->fc_encap_type, cfg->fc_encap, AF_INET, cfg, - &lwtstate); + &lwtstate, extack); if (err) goto failure; @@ -1313,7 +1321,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6d0f6c79d9aa..d56659e97a6e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1099,6 +1099,22 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } +static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) +{ + if (plen > KEYLENGTH) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + return false; + } + + if ((plen < KEYLENGTH) && (key << plen)) { + NL_SET_ERR_MSG(extack, + "Invalid prefix for given prefix length"); + return false; + } + + return true; +} + /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1115,16 +1131,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, u32 key; int err; - if (plen > KEYLENGTH) - return -EINVAL; - key = ntohl(cfg->fc_dst); - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); - - if ((plen < KEYLENGTH) && (key << plen)) + if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; + pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); + fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); @@ -1452,6 +1465,7 @@ found: if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&fi->fib_clntref); + res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->type = fa->fa_type; @@ -1507,7 +1521,7 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, - struct fib_config *cfg) + struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; @@ -1517,12 +1531,9 @@ int fib_table_delete(struct net *net, struct fib_table *tb, u8 tos = cfg->fc_tos; u32 key; - if (plen > KEYLENGTH) - return -EINVAL; - key = ntohl(cfg->fc_dst); - if ((plen < KEYLENGTH) && (key << plen)) + if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; l = fib_find_node(t, &tp, key); @@ -1551,7 +1562,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi) == 0) { + fib_nh_match(cfg, fi, extack) == 0) { fa_to_delete = fa; break; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 43318b5f5647..c2be26b98b5f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -489,7 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key_hash(net, fl4, skb_in); + rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; @@ -657,8 +657,12 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) /* Needed by both icmp_global_allow and icmp_xmit_lock */ local_bh_disable(); - /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!icmpv4_global_allow(net, type, code)) + /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless + * incoming dev is loopback. If outgoing dev change to not be + * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) + */ + if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && + !icmpv4_global_allow(net, type, code)) goto out_bh_enable; sk = icmp_xmit_lock(net); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 44fd86de2823..c4032302d7cd 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -414,7 +414,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, skb = igmpv3_newpack(dev, dev->mtu); if (!skb) return NULL; - pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec)); + pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; @@ -508,7 +508,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, } if (!skb) return NULL; - psrc = (__be32 *)skb_put(skb, sizeof(__be32)); + psrc = skb_put(skb, sizeof(__be32)); *psrc = psf->sf_inaddr; scount++; stotal++; if ((type == IGMPV3_ALLOW_NEW_SOURCES || @@ -742,7 +742,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; - ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + ih = skb_put(skb, sizeof(struct igmphdr)); ih->type = type; ih->code = 0; ih->csum = 0; @@ -1112,6 +1112,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); if (!pmc) return; + spin_lock_init(&pmc->lock); spin_lock_bh(&im->lock); pmc->interface = im->interface; in_dev_hold(in_dev); @@ -2071,21 +2072,26 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, static void ip_mc_clear_src(struct ip_mc_list *pmc) { - struct ip_sf_list *psf, *nextpsf; + struct ip_sf_list *psf, *nextpsf, *tomb, *sources; - for (psf = pmc->tomb; psf; psf = nextpsf) { + spin_lock_bh(&pmc->lock); + tomb = pmc->tomb; + pmc->tomb = NULL; + sources = pmc->sources; + pmc->sources = NULL; + pmc->sfmode = MCAST_EXCLUDE; + pmc->sfcount[MCAST_INCLUDE] = 0; + pmc->sfcount[MCAST_EXCLUDE] = 1; + spin_unlock_bh(&pmc->lock); + + for (psf = tomb; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } - pmc->tomb = NULL; - for (psf = pmc->sources; psf; psf = nextpsf) { + for (psf = sources; psf; psf = nextpsf) { nextpsf = psf->sf_next; kfree(psf); } - pmc->sources = NULL; - pmc->sfmode = MCAST_EXCLUDE; - pmc->sfcount[MCAST_INCLUDE] = 0; - pmc->sfcount[MCAST_EXCLUDE] = 1; } /* Join a multicast group diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 82dec8825d28..a3fa1a5b6d98 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -790,7 +790,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); - newsk->sk_write_space = sk_stream_write_space; /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e90c80a548ad..7a7829e839c2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -592,7 +592,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, struct iphdr *iph; struct gre_base_hdr *greh; - iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); + iph = skb_push(skb, t->hlen + sizeof(*iph)); greh = (struct gre_base_hdr *)(iph+1); greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); @@ -779,7 +779,8 @@ static struct pernet_operations ipgre_net_ops = { .size = sizeof(struct ip_tunnel_net), }; -static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { __be16 flags; @@ -802,7 +803,8 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } -static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) +static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { __be32 daddr; @@ -823,7 +825,7 @@ static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) } out: - return ipgre_tunnel_validate(tb, data); + return ipgre_tunnel_validate(tb, data, extack); } static int ipgre_netlink_parms(struct net_device *dev, @@ -957,7 +959,8 @@ static void ipgre_tap_setup(struct net_device *dev) } static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; @@ -979,7 +982,8 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) + struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; @@ -1155,7 +1159,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, t = netdev_priv(dev); t->collect_md = true; - err = ipgre_newlink(net, dev, tb, NULL); + err = ipgre_newlink(net, dev, tb, NULL, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index b878ecbc0608..129d1a3616f8 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -446,6 +446,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, return 0; drop: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } @@ -967,7 +969,6 @@ static void ip_tunnel_dev_free(struct net_device *dev) gro_cells_destroy(&tunnel->gro_cells); dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); - free_netdev(dev); } void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) @@ -1155,7 +1156,8 @@ int ip_tunnel_init(struct net_device *dev) struct iphdr *iph = &tunnel->parms.iph; int err; - dev->destructor = ip_tunnel_dev_free; + dev->needs_free_netdev = true; + dev->priv_destructor = ip_tunnel_dev_free; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index baf196eaf1d8..2f39479be92f 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -134,10 +134,12 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, struct metadata_dst *res; struct ip_tunnel_info *dst, *src; - if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + if (!md || md->type != METADATA_IP_TUNNEL || + md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; - res = metadata_dst_alloc(0, flags); + res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags); if (!res) return NULL; @@ -228,14 +230,16 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { static int ip_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, NULL); + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, + extack); if (err < 0) return err; @@ -325,7 +329,8 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { static int ip6_tun_build_state(struct nlattr *attr, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; @@ -333,7 +338,7 @@ static int ip6_tun_build_state(struct nlattr *attr, int err; err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, - NULL); + extack); if (err < 0) return err; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 4ec9affb2252..0192c255e508 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -465,7 +465,8 @@ static struct pernet_operations vti_net_ops = { .size = sizeof(struct ip_tunnel_net), }; -static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { return 0; } @@ -503,7 +504,8 @@ static void vti_netlink_parms(struct nlattr *data[], } static int vti_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip_tunnel_parm parms; __u32 fwmark = 0; @@ -513,7 +515,8 @@ static int vti_newlink(struct net *src_net, struct net_device *dev, } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) + struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); __u32 fwmark = t->fwmark; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index c3b12b1c7162..4c5dfe6bd34d 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -813,8 +813,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d if (!skb) return; skb_reserve(skb, hlen); - b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); - memset(b, 0, sizeof(struct bootp_pkt)); + b = skb_put_zero(skb, sizeof(struct bootp_pkt)); /* Construct IP header */ skb_reset_network_header(skb); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 1e441c6f2160..fb1ad22b5e29 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -375,7 +375,8 @@ static int ipip_tunnel_init(struct net_device *dev) return ip_tunnel_init(dev); } -static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) +static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { u8 proto; @@ -469,7 +470,8 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[], } static int ipip_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; @@ -488,7 +490,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) + struct nlattr *data[], + struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 551de4d023a8..bb909f1d7537 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -101,14 +101,15 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); static void ip_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *cache, - int local); + struct net_device *dev, struct sk_buff *skb, + struct mfc_cache *cache, int local); static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); +static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(unsigned long arg); @@ -501,7 +502,7 @@ static void reg_vif_setup(struct net_device *dev) dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; - dev->destructor = free_netdev; + dev->needs_free_netdev = true; dev->features |= NETIF_F_NETNS_LOCAL; } @@ -669,7 +670,8 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { - struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + struct nlmsghdr *nlh = skb_pull(skb, + sizeof(struct iphdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); @@ -972,7 +974,8 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, /* Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { - struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + struct nlmsghdr *nlh = skb_pull(skb, + sizeof(struct iphdr)); if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - @@ -988,13 +991,12 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { - ip_mr_forward(net, mrt, skb, c, 0); + ip_mr_forward(net, mrt, skb->dev, skb, c, 0); } } } -/* Bounce a cache query up to mrouted. We could use netlink for this but mrouted - * expects the following bizarre scheme. +/* Bounce a cache query up to mrouted and netlink. * * Called under mrt_lock. */ @@ -1044,7 +1046,7 @@ static int ipmr_cache_report(struct mr_table *mrt, msg->im_vif = vifi; skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* Add our header */ - igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + igmp = skb_put(skb, sizeof(struct igmphdr)); igmp->type = assert; msg->im_msgtype = assert; igmp->code = 0; @@ -1060,6 +1062,8 @@ static int ipmr_cache_report(struct mr_table *mrt, return -EINVAL; } + igmpmsg_netlink_event(mrt, skb); + /* Deliver to mrouted */ ret = sock_queue_rcv_skb(mroute_sk, skb); rcu_read_unlock(); @@ -1073,7 +1077,7 @@ static int ipmr_cache_report(struct mr_table *mrt, /* Queue a packet for resolution. It gets locked cache entry! */ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); struct mfc_cache *c; @@ -1130,6 +1134,10 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, kfree_skb(skb); err = -ENOBUFS; } else { + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; + } skb_queue_tail(&c->mfc_un.unres.unresolved, skb); err = 0; } @@ -1828,10 +1836,10 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *cache, - int local) + struct net_device *dev, struct sk_buff *skb, + struct mfc_cache *cache, int local) { - int true_vifi = ipmr_find_vif(mrt, skb->dev); + int true_vifi = ipmr_find_vif(mrt, dev); int psend = -1; int vif, ct; @@ -1853,13 +1861,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, } /* Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif_table[vif].dev != skb->dev) { - struct net_device *mdev; - - mdev = l3mdev_master_dev_rcu(mrt->vif_table[vif].dev); - if (mdev == skb->dev) - goto forward; - + if (mrt->vif_table[vif].dev != dev) { if (rt_is_output_route(skb_rtable(skb))) { /* It is our own packet, looped back. * Very complicated situation... @@ -2053,7 +2055,7 @@ int ip_mr_input(struct sk_buff *skb) read_lock(&mrt_lock); vif = ipmr_find_vif(mrt, dev); if (vif >= 0) { - int err2 = ipmr_cache_unresolved(mrt, vif, skb); + int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev); read_unlock(&mrt_lock); return err2; @@ -2064,7 +2066,7 @@ int ip_mr_input(struct sk_buff *skb) } read_lock(&mrt_lock); - ip_mr_forward(net, mrt, skb, cache, local); + ip_mr_forward(net, mrt, dev, skb, cache, local); read_unlock(&mrt_lock); if (local) @@ -2238,7 +2240,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, iph->saddr = saddr; iph->daddr = daddr; iph->version = 0; - err = ipmr_cache_unresolved(mrt, vif, skb2); + err = ipmr_cache_unresolved(mrt, vif, skb2, dev); read_unlock(&mrt_lock); rcu_read_unlock(); return err; @@ -2341,6 +2343,130 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); } +static size_t igmpmsg_netlink_msgsize(size_t payloadlen) +{ + size_t len = + NLMSG_ALIGN(sizeof(struct rtgenmsg)) + + nla_total_size(1) /* IPMRA_CREPORT_MSGTYPE */ + + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */ + /* IPMRA_CREPORT_PKT */ + + nla_total_size(payloadlen) + ; + + return len; +} + +static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) +{ + struct net *net = read_pnet(&mrt->net); + struct nlmsghdr *nlh; + struct rtgenmsg *rtgenm; + struct igmpmsg *msg; + struct sk_buff *skb; + struct nlattr *nla; + int payloadlen; + + payloadlen = pkt->len - sizeof(struct igmpmsg); + msg = (struct igmpmsg *)skb_network_header(pkt); + + skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC); + if (!skb) + goto errout; + + nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, + sizeof(struct rtgenmsg), 0); + if (!nlh) + goto errout; + rtgenm = nlmsg_data(nlh); + rtgenm->rtgen_family = RTNL_FAMILY_IPMR; + if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) || + nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) || + nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, + msg->im_src.s_addr) || + nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR, + msg->im_dst.s_addr)) + goto nla_put_failure; + + nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen); + if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg), + nla_data(nla), payloadlen)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC); + return; + +nla_put_failure: + nlmsg_cancel(skb, nlh); +errout: + kfree_skb(skb); + rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS); +} + +static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct nlattr *tb[RTA_MAX + 1]; + struct sk_buff *skb = NULL; + struct mfc_cache *cache; + struct mr_table *mrt; + struct rtmsg *rtm; + __be32 src, grp; + u32 tableid; + int err; + + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, + rtm_ipv4_policy, extack); + if (err < 0) + goto errout; + + rtm = nlmsg_data(nlh); + + src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; + grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; + tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; + + mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); + if (IS_ERR(mrt)) { + err = PTR_ERR(mrt); + goto errout_free; + } + + /* entries are added/deleted only under RTNL */ + rcu_read_lock(); + cache = ipmr_cache_find(mrt, src, grp); + rcu_read_unlock(); + if (!cache) { + err = -ENOENT; + goto errout_free; + } + + skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL); + if (!skb) { + err = -ENOBUFS; + goto errout_free; + } + + err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, cache, + RTM_NEWROUTE, 0); + if (err < 0) + goto errout_free; + + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + +errout: + return err; + +errout_free: + kfree_skb(skb); + goto errout; +} + static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); @@ -2528,6 +2654,129 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, return ipmr_mfc_delete(tbl, &mfcc, parent); } +static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) +{ + u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len); + + if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || + nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || + nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, + mrt->mroute_reg_vif_num) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, + mrt->mroute_do_assert) || + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim)) + return false; + + return true; +} + +static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb) +{ + struct nlattr *vif_nest; + struct vif_device *vif; + + /* if the VIF doesn't exist just continue */ + if (!VIF_EXISTS(mrt, vifid)) + return true; + + vif = &mrt->vif_table[vifid]; + vif_nest = nla_nest_start(skb, IPMRA_VIF); + if (!vif_nest) + return false; + if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) || + nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || + nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, + IPMRA_VIFA_PAD) || + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out, + IPMRA_VIFA_PAD) || + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, + IPMRA_VIFA_PAD) || + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, + IPMRA_VIFA_PAD) || + nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) || + nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) { + nla_nest_cancel(skb, vif_nest); + return false; + } + nla_nest_end(skb, vif_nest); + + return true; +} + +static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlmsghdr *nlh = NULL; + unsigned int t = 0, s_t; + unsigned int e = 0, s_e; + struct mr_table *mrt; + + s_t = cb->args[0]; + s_e = cb->args[1]; + + ipmr_for_each_table(mrt, net) { + struct nlattr *vifs, *af; + struct ifinfomsg *hdr; + u32 i; + + if (t < s_t) + goto skip_table; + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, RTM_NEWLINK, + sizeof(*hdr), NLM_F_MULTI); + if (!nlh) + break; + + hdr = nlmsg_data(nlh); + memset(hdr, 0, sizeof(*hdr)); + hdr->ifi_family = RTNL_FAMILY_IPMR; + + af = nla_nest_start(skb, IFLA_AF_SPEC); + if (!af) { + nlmsg_cancel(skb, nlh); + goto out; + } + + if (!ipmr_fill_table(mrt, skb)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS); + if (!vifs) { + nla_nest_end(skb, af); + nlmsg_end(skb, nlh); + goto out; + } + for (i = 0; i < mrt->maxvif; i++) { + if (e < s_e) + goto skip_entry; + if (!ipmr_fill_vif(mrt, i, skb)) { + nla_nest_end(skb, vifs); + nla_nest_end(skb, af); + nlmsg_end(skb, nlh); + goto out; + } +skip_entry: + e++; + } + s_e = 0; + e = 0; + nla_nest_end(skb, vifs); + nla_nest_end(skb, af); + nlmsg_end(skb, nlh); +skip_table: + t++; + } + +out: + cb->args[1] = e; + cb->args[0] = t; + + return skb->len; +} + #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing : * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif @@ -2865,11 +3114,14 @@ int __init ip_mr_init(void) } #endif rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, - NULL, ipmr_rtm_dumproute, NULL); + ipmr_rtm_getroute, ipmr_rtm_dumproute, NULL); rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, ipmr_rtm_route, NULL, NULL); rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, ipmr_rtm_route, NULL, NULL); + + rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, + NULL, ipmr_rtm_dumplink, NULL); return 0; #ifdef CONFIG_IP_PIMSM_V2 diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index af2b69b6895f..f1528f7175a8 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -24,7 +24,7 @@ synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, struct iphdr *iph; skb_reset_network_header(skb); - iph = (struct iphdr *)skb_put(skb, sizeof(*iph)); + iph = skb_put(skb, sizeof(*iph)); iph->version = 4; iph->ihl = sizeof(*iph) / 4; iph->tos = 0; @@ -91,7 +91,7 @@ synproxy_send_client_synack(struct net *net, niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); - nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss)); @@ -133,7 +133,7 @@ synproxy_send_server_syn(struct net *net, niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); - nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(recv_seq - 1); @@ -178,7 +178,7 @@ synproxy_send_server_ack(struct net *net, niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); - nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth = skb_put(nskb, tcp_hdr_size); nth->source = th->dest; nth->dest = th->source; nth->seq = htonl(ntohl(th->ack_seq)); @@ -216,7 +216,7 @@ synproxy_send_client_ack(struct net *net, niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); - nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); + nth = skb_put(nskb, tcp_hdr_size); nth->source = th->source; nth->dest = th->dest; nth->seq = htonl(ntohl(th->seq) + 1); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 6f8d9e5e062b..eeacbdaf7cdf 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -51,7 +51,7 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, struct iphdr *niph, *oiph = ip_hdr(oldskb); skb_reset_network_header(nskb); - niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr)); + niph = skb_put(nskb, sizeof(struct iphdr)); niph->version = 4; niph->ihl = sizeof(struct iphdr) / 4; niph->tos = 0; @@ -76,8 +76,7 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, struct tcphdr *tcph; skb_reset_transport_header(nskb); - tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr)); - memset(tcph, 0, sizeof(*tcph)); + tcph = skb_put_zero(nskb, sizeof(struct tcphdr)); tcph->source = oth->dest; tcph->dest = oth->source; tcph->doff = sizeof(struct tcphdr) / 4; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index fa44e752a9a3..43eb6567b3a0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -250,6 +250,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM("TCPMemoryPressuresChrono", LINUX_MIB_TCPMEMORYPRESSURESCHRONO), SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9eebe43e..c816cd53f7fc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -114,6 +114,8 @@ #include <net/ip_tunnels.h> #include <net/l3mdev.h> +#include "fib_lookup.h" + #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -587,11 +589,6 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, build_sk_flow_key(fl4, sk); } -static inline void rt_free(struct rtable *rt) -{ - call_rcu(&rt->dst.rcu_head, dst_rcu_free); -} - static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) @@ -601,12 +598,14 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); - rt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); - rt_free(rt); + dst_dev_put(&rt->dst); + dst_release(&rt->dst); } } @@ -1300,7 +1299,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, - __be32 daddr) + __be32 daddr, const bool do_cache) { bool ret = false; @@ -1329,10 +1328,13 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, if (!rt->rt_gateway) rt->rt_gateway = daddr; - if (!(rt->dst.flags & DST_NOCACHE)) { + if (do_cache) { + dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); - if (orig) - rt_free(orig); + if (orig) { + dst_dev_put(&orig->dst); + dst_release(&orig->dst); + } ret = true; } @@ -1355,12 +1357,20 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) } orig = *p; + /* hold dst before doing cmpxchg() to avoid race condition + * on this dst + */ + dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { - if (orig) - rt_free(orig); - } else + if (orig) { + dst_dev_put(&orig->dst); + dst_release(&orig->dst); + } + } else { + dst_release(&rt->dst); ret = false; + } return ret; } @@ -1385,8 +1395,12 @@ static void rt_add_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + kfree(p); + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1427,7 +1441,8 @@ static bool rt_cache_valid(const struct rtable *rt) static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, - struct fib_info *fi, u16 type, u32 itag) + struct fib_info *fi, u16 type, u32 itag, + const bool do_cache) { bool cached = false; @@ -1438,14 +1453,18 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); + if (fi->fib_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + atomic_inc(&fi->fib_metrics->refcnt); + } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) - cached = rt_bind_exception(rt, fnhe, daddr); - else if (!(rt->dst.flags & DST_NOCACHE)) + cached = rt_bind_exception(rt, fnhe, daddr, do_cache); + else if (do_cache) cached = rt_cache_route(nh, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or @@ -1453,7 +1472,6 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ - rt->dst.flags |= DST_NOCACHE; if (!rt->rt_gateway) rt->rt_gateway = daddr; rt_add_uncached_list(rt); @@ -1476,7 +1494,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | + (will_cache ? 0 : DST_HOST) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); @@ -1720,7 +1738,8 @@ rt_cache: rth->dst.input = ip_forward; - rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, + do_cache); set_lwt_redirect(rth); skb_dst_set(skb, &rth->dst); out: @@ -1852,9 +1871,9 @@ static int ip_mkroute_input(struct sk_buff *skb, */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) + u8 tos, struct net_device *dev, + struct fib_result *res) { - struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); struct ip_tunnel_info *tun_info; struct flowi4 fl4; @@ -1884,8 +1903,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; - res.fi = NULL; - res.table = NULL; + res->fi = NULL; + res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; @@ -1921,17 +1940,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); - err = fib_lookup(net, &fl4, &res, 0); + err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } - if (res.type == RTN_BROADCAST) + if (res->type == RTN_BROADCAST) goto brd_input; - if (res.type == RTN_LOCAL) { + if (res->type == RTN_LOCAL) { err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &itag); if (err < 0) @@ -1943,10 +1962,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = -EHOSTUNREACH; goto no_route; } - if (res.type != RTN_UNICAST) + if (res->type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -1960,14 +1979,14 @@ brd_input: goto martian_source; } flags |= RTCF_BROADCAST; - res.type = RTN_BROADCAST; + res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: do_cache = false; - if (res.fi) { + if (res->fi) { if (!itag) { - rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input); + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); err = 0; @@ -1978,7 +1997,7 @@ local_input: } rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev, - flags | RTCF_LOCAL, res.type, + flags | RTCF_LOCAL, res->type, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); if (!rth) goto e_nobufs; @@ -1988,18 +2007,18 @@ local_input: rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; - if (res.table) - rth->rt_table_id = res.table->tb_id; + if (res->table) + rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); - if (res.type == RTN_UNREACHABLE) { + if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { - struct fib_nh *nh = &FIB_RES_NH(res); + struct fib_nh *nh = &FIB_RES_NH(*res); rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { @@ -2008,10 +2027,8 @@ local_input: rth->dst.input = lwtunnel_input; } - if (unlikely(!rt_cache_route(nh, rth))) { - rth->dst.flags |= DST_NOCACHE; + if (unlikely(!rt_cache_route(nh, rth))) rt_add_uncached_list(rth); - } } skb_dst_set(skb, &rth->dst); err = 0; @@ -2019,9 +2036,9 @@ local_input: no_route: RT_CACHE_STAT_INC(in_no_route); - res.type = RTN_UNREACHABLE; - res.fi = NULL; - res.table = NULL; + res->type = RTN_UNREACHABLE; + res->fi = NULL; + res->table = NULL; goto local_input; /* @@ -2051,11 +2068,22 @@ martian_source: int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev) { - int res; + struct fib_result res; + int err; tos &= IPTOS_RT_MASK; rcu_read_lock(); + err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(ip_route_input_noref); + +/* called with rcu_read_lock held */ +int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, struct fib_result *res) +{ /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2070,6 +2098,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; + int err = -EINVAL; if (in_dev) our = ip_check_mc_rcu(in_dev, daddr, saddr, @@ -2085,7 +2114,6 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol); } - res = -EINVAL; if (our #ifdef CONFIG_IP_MROUTE || @@ -2093,17 +2121,14 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, IN_DEV_MFORWARD(in_dev)) #endif ) { - res = ip_route_input_mc(skb, daddr, saddr, + err = ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } - rcu_read_unlock(); - return res; + return err; } - res = ip_route_input_slow(skb, daddr, saddr, tos, dev); - rcu_read_unlock(); - return res; + + return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); } -EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, @@ -2199,10 +2224,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth = rcu_dereference(*prth); rt_cache: - if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); + if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; - } } add: @@ -2236,7 +2259,7 @@ add: #endif } - rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); + rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); set_lwt_redirect(rth); return rth; @@ -2246,29 +2269,40 @@ add: * Major route resolver routine. */ -struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, - const struct sk_buff *skb) +struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, + const struct sk_buff *skb) { - struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); - unsigned int flags = 0; struct fib_result res; struct rtable *rth; - int orig_oif; - int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; res.table = NULL; - orig_oif = fl4->flowi4_oif; - fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); rcu_read_lock(); + rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); + rcu_read_unlock(); + + return rth; +} +EXPORT_SYMBOL_GPL(ip_route_output_key_hash); + +struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, + struct fib_result *res, + const struct sk_buff *skb) +{ + struct net_device *dev_out = NULL; + int orig_oif = fl4->flowi4_oif; + unsigned int flags = 0; + struct rtable *rth; + int err = -ENETUNREACH; + if (fl4->saddr) { rth = ERR_PTR(-EINVAL); if (ipv4_is_multicast(fl4->saddr) || @@ -2354,15 +2388,15 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; - res.type = RTN_LOCAL; + res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - err = fib_lookup(net, fl4, &res, 0); + err = fib_lookup(net, fl4, res, 0); if (err) { - res.fi = NULL; - res.table = NULL; + res->fi = NULL; + res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !netif_index_is_l3_master(net, fl4->flowi4_oif))) { @@ -2387,43 +2421,41 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); - res.type = RTN_UNICAST; + res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } - if (res.type == RTN_LOCAL) { + if (res->type == RTN_LOCAL) { if (!fl4->saddr) { - if (res.fi->fib_prefsrc) - fl4->saddr = res.fi->fib_prefsrc; + if (res->fi->fib_prefsrc) + fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ - dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? : + dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } - fib_select_path(net, &res, fl4, skb); + fib_select_path(net, res, fl4, skb); - dev_out = FIB_RES_DEV(res); + dev_out = FIB_RES_DEV(*res); fl4->flowi4_oif = dev_out->ifindex; make_route: - rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags); + rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: - rcu_read_unlock(); return rth; } -EXPORT_SYMBOL_GPL(__ip_route_output_key_hash); static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) { @@ -2477,7 +2509,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->input = dst_discard; new->output = dst_discard_out; - new->dev = ort->dst.dev; + new->dev = net->loopback_dev; if (new->dev) dev_hold(new->dev); @@ -2492,7 +2524,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - dst_free(new); } dst_release(dst_orig); @@ -2517,9 +2548,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, } EXPORT_SYMBOL_GPL(ip_route_output_flow); +/* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq, int event) + u32 seq) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; @@ -2528,7 +2560,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0); + nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); if (!nlh) return -EMSGSIZE; @@ -2636,6 +2668,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; struct nlattr *tb[RTA_MAX+1]; + struct fib_result res = {}; struct rtable *rt = NULL; struct flowi4 fl4; __be32 dst = 0; @@ -2692,10 +2725,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; + rcu_read_lock(); + if (iif) { struct net_device *dev; - dev = __dev_get_by_index(net, iif); + dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_free; @@ -2704,14 +2739,14 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->protocol = htons(ETH_P_IP); skb->dev = dev; skb->mark = mark; - err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, + dev, &res); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - rt = ip_route_output_key(net, &fl4); - + rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); @@ -2727,17 +2762,25 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = rt->rt_table_id; - err = rt_fill_info(net, dst, src, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWROUTE); + if (rtm->rtm_flags & RTM_F_FIB_MATCH) + err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, RTM_NEWROUTE, table_id, + rt->rt_type, res.prefix, res.prefixlen, + fl4.flowi4_tos, res.fi, 0); + else + err = rt_fill_info(net, dst, src, table_id, &fl4, skb, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); if (err < 0) goto errout_free; + rcu_read_unlock(); + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; errout_free: + rcu_read_unlock(); kfree_skb(skb); goto errout; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6426250a58ea..7835bb4a1fab 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -232,7 +232,8 @@ EXPORT_SYMBOL(tcp_get_cookie_sock); * return false if we decode a tcp option that is disabled * on the host. */ -bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) +bool cookie_timestamp_decode(const struct net *net, + struct tcp_options_received *tcp_opt) { /* echoed timestamp, lowest bits contain options */ u32 options = tcp_opt->rcv_tsecr; @@ -242,12 +243,12 @@ bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) return true; } - if (!sysctl_tcp_timestamps) + if (!net->ipv4.sysctl_tcp_timestamps) return false; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; - if (tcp_opt->sack_ok && !sysctl_tcp_sack) + if (tcp_opt->sack_ok && !net->ipv4.sysctl_tcp_sack) return false; if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) @@ -256,7 +257,7 @@ bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt) tcp_opt->wscale_ok = 1; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; - return sysctl_tcp_window_scaling != 0; + return net->ipv4.sysctl_tcp_window_scaling != 0; } EXPORT_SYMBOL(cookie_timestamp_decode); @@ -312,14 +313,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, 0, NULL); + tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL); if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { - tsoff = secure_tcp_ts_off(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); + tsoff = secure_tcp_ts_off(sock_net(sk), + ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr); tcp_opt.rcv_tsecr -= tsoff; } - if (!cookie_timestamp_decode(&tcp_opt)) + if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt)) goto out; ret = NULL; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 86957e9cd6c6..9bf809726066 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -360,32 +360,30 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) tcp_fastopen_active_timeout_reset(); + + return ret; +} + +static int proc_tcp_available_ulp(struct ctl_table *ctl, + int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + return ret; } static struct ctl_table ipv4_table[] = { { - .procname = "tcp_timestamps", - .data = &sysctl_tcp_timestamps, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_window_scaling", - .data = &sysctl_tcp_window_scaling, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_sack", - .data = &sysctl_tcp_sack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_retrans_collapse", .data = &sysctl_tcp_retrans_collapse, .maxlen = sizeof(int), @@ -707,6 +705,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_ms_jiffies, }, { + .procname = "tcp_available_ulp", + .maxlen = TCP_ULP_BUF_MAX, + .mode = 0444, + .proc_handler = proc_tcp_available_ulp, + }, + { .procname = "icmp_msgs_per_sec", .data = &sysctl_icmp_msgs_per_sec, .maxlen = sizeof(int), @@ -1116,6 +1120,27 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = &one, }, #endif + { + .procname = "tcp_sack", + .data = &init_net.ipv4.sysctl_tcp_sack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_window_scaling", + .data = &init_net.ipv4.sysctl_tcp_window_scaling, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_timestamps", + .data = &init_net.ipv4.sysctl_tcp_timestamps, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index aaf663a1e571..4c88d20d91d4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -320,17 +320,36 @@ struct tcp_splice_state { * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ -int tcp_memory_pressure __read_mostly; -EXPORT_SYMBOL(tcp_memory_pressure); +unsigned long tcp_memory_pressure __read_mostly; +EXPORT_SYMBOL_GPL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { - if (!tcp_memory_pressure) { + unsigned long val; + + if (tcp_memory_pressure) + return; + val = jiffies; + + if (!val) + val--; + if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); - tcp_memory_pressure = 1; - } } -EXPORT_SYMBOL(tcp_enter_memory_pressure); +EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); + +void tcp_leave_memory_pressure(struct sock *sk) +{ + unsigned long val; + + if (!tcp_memory_pressure) + return; + val = xchg(&tcp_memory_pressure, 0); + if (val) + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, + jiffies_to_msecs(jiffies - val)); +} +EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) @@ -882,8 +901,8 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) return mss_now; } -static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, - size_t size, int flags) +ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, + size_t size, int flags) { struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; @@ -1013,6 +1032,7 @@ out_err: } return sk_stream_error(sk, flags, err); } +EXPORT_SYMBOL_GPL(do_tcp_sendpages); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) @@ -1084,9 +1104,12 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE)) + if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && + uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ @@ -1108,7 +1131,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, msg->msg_name, + err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst @@ -2378,9 +2401,10 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l return 0; } -static int tcp_repair_options_est(struct tcp_sock *tp, +static int tcp_repair_options_est(struct sock *sk, struct tcp_repair_opt __user *optbuf, unsigned int len) { + struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; while (len >= sizeof(opt)) { @@ -2393,6 +2417,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp, switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; + tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { @@ -2458,6 +2483,24 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_ULP: { + char name[TCP_ULP_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_user(name, optval, + min_t(long, TCP_ULP_NAME_MAX - 1, + optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = tcp_set_ulp(sk, name); + release_sock(sk); + return err; + } default: /* fallthru */ break; @@ -2553,7 +2596,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED) - err = tcp_repair_options_est(tp, + err = tcp_repair_options_est(sk, (struct tcp_repair_opt __user *)optval, optlen); else @@ -2671,8 +2714,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: + case TCP_MD5SIG_EXT: /* Read the IP->Key mappings from userspace */ - err = tp->af_specific->md5_parse(sk, optval, optlen); + err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_USER_TIMEOUT: @@ -3014,6 +3058,21 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; + case TCP_ULP: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); + if (!icsk->icsk_ulp_ops) { + if (put_user(0, optlen)) + return -EFAULT; + return 0; + } + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len)) + return -EFAULT; + return 0; + case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6e3c512054a6..324c9bcc5456 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -180,6 +180,7 @@ void tcp_init_congestion_control(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2fa55f57ac06..2ab7e2fa9bb9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -76,9 +76,6 @@ #include <asm/unaligned.h> #include <linux/errqueue.h> -int sysctl_tcp_timestamps __read_mostly = 1; -int sysctl_tcp_window_scaling __read_mostly = 1; -int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; @@ -112,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ +#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -2920,9 +2918,9 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) rtt_us ? : jiffies_to_usecs(1)); } -static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - long seq_rtt_us, long sack_rtt_us, - long ca_rtt_us) +static bool tcp_ack_update_rtt(struct sock *sk, const int flag, + long seq_rtt_us, long sack_rtt_us, + long ca_rtt_us, struct rate_sample *rs) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2947,6 +2945,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, seq_rtt_us = ca_rtt_us = delta_us; } + rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; @@ -2966,12 +2965,13 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) { + struct rate_sample rs; long rtt_us = -1L; if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us); + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs); } @@ -3176,9 +3176,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); } - sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, - ca_rtt_us); + ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); @@ -3214,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = ca_rtt_us, + .rtt_us = sack->rate->rtt_us, .in_flight = last_in_flight }; icsk->icsk_ca_ops->pkts_acked(sk, &sample); @@ -3568,7 +3567,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) { /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { - tcp_send_challenge_ack(sk, skb); + if (!(flag & FLAG_NO_CHALLENGE_ACK)) + tcp_send_challenge_ack(sk, skb); return -1; } goto old_ack; @@ -3721,7 +3721,8 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, * But, this can also be called on packets in the established flow when * the fast version below fails. */ -void tcp_parse_options(const struct sk_buff *skb, +void tcp_parse_options(const struct net *net, + const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc) { @@ -3762,7 +3763,7 @@ void tcp_parse_options(const struct sk_buff *skb, break; case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW && th->syn && - !estab && sysctl_tcp_window_scaling) { + !estab && net->ipv4.sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; if (snd_wscale > TCP_MAX_WSCALE) { @@ -3778,7 +3779,7 @@ void tcp_parse_options(const struct sk_buff *skb, case TCPOPT_TIMESTAMP: if ((opsize == TCPOLEN_TIMESTAMP) && ((estab && opt_rx->tstamp_ok) || - (!estab && sysctl_tcp_timestamps))) { + (!estab && net->ipv4.sysctl_tcp_timestamps))) { opt_rx->saw_tstamp = 1; opt_rx->rcv_tsval = get_unaligned_be32(ptr); opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); @@ -3786,7 +3787,7 @@ void tcp_parse_options(const struct sk_buff *skb, break; case TCPOPT_SACK_PERM: if (opsize == TCPOLEN_SACK_PERM && th->syn && - !estab && sysctl_tcp_sack) { + !estab && net->ipv4.sysctl_tcp_sack) { opt_rx->sack_ok = TCP_SACK_SEEN; tcp_sack_reset(opt_rx); } @@ -3855,7 +3856,8 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static bool tcp_fast_parse_options(const struct sk_buff *skb, +static bool tcp_fast_parse_options(const struct net *net, + const struct sk_buff *skb, const struct tcphdr *th, struct tcp_sock *tp) { /* In the spirit of fast parsing, compare doff directly to constant @@ -3870,7 +3872,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, return true; } - tcp_parse_options(skb, &tp->rx_opt, 1, NULL); + tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -5231,7 +5233,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, bool rst_seq_match = false; /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) && + tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5602,7 +5605,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, /* Get original SYNACK MSS value if user MSS sets mss_clamp */ tcp_clear_options(&opt); opt.user_mss = opt.mss_clamp = 0; - tcp_parse_options(synack, &opt, 0, NULL); + tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL); mss = opt.mss_clamp; } @@ -5656,7 +5659,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, int saved_clamp = tp->rx_opt.mss_clamp; bool fastopen_fail; - tcp_parse_options(skb, &tp->rx_opt, 0, &foc); + tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -5951,13 +5954,17 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) /* step 5: check the ACK field */ acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT) > 0; + FLAG_UPDATE_TS_RECENT | + FLAG_NO_CHALLENGE_ACK) > 0; + if (!acceptable) { + if (sk->sk_state == TCP_SYN_RECV) + return 1; /* send one RST */ + tcp_send_challenge_ack(sk, skb); + goto discard; + } switch (sk->sk_state) { case TCP_SYN_RECV: - if (!acceptable) - return 1; - if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); @@ -6026,14 +6033,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * our SYNACK so stop the SYNACK timer. */ if (req) { - /* Return RST if ack_seq is invalid. - * Note that RFC793 only says to generate a - * DUPACK for it but for TCP Fast Open it seems - * better to treat this case like TCP_SYN_RECV - * above. - */ - if (!acceptable) - return 1; /* We no longer need the request sock. */ reqsk_fastopen_remove(sk, req, false); tcp_rearm_rto(sk); @@ -6333,7 +6332,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); + tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, + want_cookie ? NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -6351,7 +6351,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_free; if (tmp_opt.tstamp_ok) - tcp_rsk(req)->ts_off = af_ops->init_ts_off(skb); + tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); if (!want_cookie && !isn) { /* Kill the following clause, if you dislike this way. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 191b2f78b19d..d774bcd9a54b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -80,6 +80,7 @@ #include <linux/stddef.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/inetdevice.h> #include <crypto/hash.h> #include <linux/scatterlist.h> @@ -102,10 +103,9 @@ static u32 tcp_v4_init_seq(const struct sk_buff *skb) tcp_hdr(skb)->source); } -static u32 tcp_v4_init_ts_off(const struct sk_buff *skb) +static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcp_ts_off(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr); + return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -242,7 +242,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, usin->sin_port); - tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr, + tp->tsoffset = secure_tcp_ts_off(sock_net(sk), + inet->inet_saddr, inet->inet_daddr); } @@ -906,6 +907,48 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; + const struct tcp_md5sig_info *md5sig; + __be32 mask; + struct tcp_md5sig_key *best_match = NULL; + bool match; + + /* caller either holds rcu_read_lock() or socket lock */ + md5sig = rcu_dereference_check(tp->md5sig_info, + lockdep_sock_is_held(sk)); + if (!md5sig) + return NULL; + + hlist_for_each_entry_rcu(key, &md5sig->head, node) { + if (key->family != family) + continue; + + if (family == AF_INET) { + mask = inet_make_mask(key->prefixlen); + match = (key->addr.a4.s_addr & mask) == + (addr->a4.s_addr & mask); +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, + key->prefixlen); +#endif + } else { + match = false; + } + + if (match && (!best_match || + key->prefixlen > best_match->prefixlen)) + best_match = key; + } + return best_match; +} +EXPORT_SYMBOL(tcp_md5_do_lookup); + +struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, + const union tcp_md5_addr *addr, + int family, u8 prefixlen) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *key; unsigned int size = sizeof(struct in_addr); const struct tcp_md5sig_info *md5sig; @@ -921,12 +964,12 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; - if (!memcmp(&key->addr, addr, size)) + if (!memcmp(&key->addr, addr, size) && + key->prefixlen == prefixlen) return key; } return NULL; } -EXPORT_SYMBOL(tcp_md5_do_lookup); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) @@ -940,14 +983,15 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) + int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, + gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup(sk, addr, family); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); @@ -978,6 +1022,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, memcpy(key->key, newkey, newkeylen); key->keylen = newkeylen; key->family = family; + key->prefixlen = prefixlen; memcpy(&key->addr, addr, (family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -986,11 +1031,12 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } EXPORT_SYMBOL(tcp_md5_do_add); -int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) +int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, + u8 prefixlen) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup(sk, addr, family); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1016,11 +1062,12 @@ static void tcp_clear_md5_list(struct sock *sk) } } -static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, - int optlen) +static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, + char __user *optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + u8 prefixlen = 32; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1031,15 +1078,22 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, if (sin->sin_family != AF_INET) return -EINVAL; + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { + prefixlen = cmd.tcpm_prefixlen; + if (prefixlen > 32) + return -EINVAL; + } + if (!cmd.tcpm_keylen) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET); + AF_INET, prefixlen); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, + AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1342,7 +1396,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET, key->key, key->keylen, GFP_ATOMIC); + AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif @@ -1675,6 +1729,8 @@ process: } if (nsk == sk) { reqsk_put(req); + } else if (tcp_filter(sk, skb)) { + goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; @@ -1860,6 +1916,8 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_cleanup_congestion_control(sk); + tcp_cleanup_ulp(sk); + /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); @@ -2387,6 +2445,7 @@ struct proto tcp_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, @@ -2465,6 +2524,9 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + net->ipv4.sysctl_tcp_sack = 1; + net->ipv4.sysctl_tcp_window_scaling = 1; + net->ipv4.sysctl_tcp_timestamps = 1; return 0; fail: diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d0642df73044..d30ee31e94eb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -98,7 +98,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, 0, NULL); + tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) @@ -559,7 +559,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, 0, NULL); + tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 478f75baee31..9a9c395b6235 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -569,18 +569,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, opts->mss = tcp_advertise_mss(sk); remaining -= TCPOLEN_MSS_ALIGNED; - if (likely(sysctl_tcp_timestamps && !*md5)) { + if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset; opts->tsecr = tp->rx_opt.ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } - if (likely(sysctl_tcp_window_scaling)) { + if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; remaining -= TCPOLEN_WSCALE_ALIGNED; } - if (likely(sysctl_tcp_sack)) { + if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) remaining -= TCPOLEN_SACKPERM_ALIGNED; @@ -1328,9 +1328,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, return 0; } -/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c - * eventually). The difference is that pulled data not copied, but - * immediately discarded. +/* This is similar to __pskb_pull_tail(). The difference is that pulled + * data is not copied, but immediately discarded. */ static int __pskb_trim_head(struct sk_buff *skb, int len) { @@ -1365,7 +1364,6 @@ static int __pskb_trim_head(struct sk_buff *skb, int len) } shinfo->nr_frags = k; - skb_reset_tail_pointer(skb); skb->data_len -= len; skb->len = skb->data_len; return len; @@ -3273,8 +3271,9 @@ static void tcp_connect_init(struct sock *sk) /* We'll fix this up when we get a response from the other end. * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. */ - tp->tcp_header_len = sizeof(struct tcphdr) + - (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + tp->tcp_header_len = sizeof(struct tcphdr); + if (sock_net(sk)->ipv4.sysctl_tcp_timestamps) + tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; #ifdef CONFIG_TCP_MD5SIG if (tp->af_specific->md5_lookup(sk, sk)) @@ -3305,7 +3304,7 @@ static void tcp_connect_init(struct sock *sk) tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, &tp->window_clamp, - sysctl_tcp_window_scaling, + sock_net(sk)->ipv4.sysctl_tcp_window_scaling, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index ad99569d4c1e..3330a370d306 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -185,3 +185,4 @@ void tcp_rate_check_app_limited(struct sock *sk) tp->app_limited = (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; } +EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c4a35ba7f8ed..c0feeeef962a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -139,21 +139,17 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * @timeout: A custom timeout value. * If set to 0 the default timeout is calculated and used. * Using TCP_RTO_MIN and the number of unsuccessful retransmits. - * @syn_set: true if the SYN Bit was set. * * The default "timeout" value this function can calculate and use * is equivalent to the timeout of a TCP Connection * after "boundary" unsuccessful, exponentially backed-off - * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if - * syn_set flag is set. - * + * retransmissions with an initial RTO of TCP_RTO_MIN. */ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, - unsigned int timeout, - bool syn_set) + unsigned int timeout) { - unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN; + const unsigned int rto_base = TCP_RTO_MIN; unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) @@ -181,8 +177,8 @@ static int tcp_write_timeout(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); + bool expired, do_reset; int retry_until; - bool do_reset, syn_set = false; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) { @@ -196,9 +192,9 @@ static int tcp_write_timeout(struct sock *sk) sk_rethink_txhash(sk); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; - syn_set = true; + expired = icsk->icsk_retransmits >= retry_until; } else { - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable * Fast Open on this path on recurring timeouts after @@ -224,15 +220,15 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until, 0, 0); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } + expired = retransmits_timed_out(sk, retry_until, + icsk->icsk_user_timeout); } - - if (retransmits_timed_out(sk, retry_until, - syn_set ? 0 : icsk->icsk_user_timeout, syn_set)) { + if (expired) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -540,7 +536,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c new file mode 100644 index 000000000000..2417f55374c5 --- /dev/null +++ b/net/ipv4/tcp_ulp.c @@ -0,0 +1,135 @@ +/* + * Pluggable TCP upper layer protocol support. + * + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. + * + */ + +#include<linux/module.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/gfp.h> +#include <net/tcp.h> + +static DEFINE_SPINLOCK(tcp_ulp_list_lock); +static LIST_HEAD(tcp_ulp_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_ulp_ops *tcp_ulp_find(const char *name) +{ + struct tcp_ulp_ops *e; + + list_for_each_entry_rcu(e, &tcp_ulp_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name) +{ + const struct tcp_ulp_ops *ulp = NULL; + + rcu_read_lock(); + ulp = tcp_ulp_find(name); + +#ifdef CONFIG_MODULES + if (!ulp && capable(CAP_NET_ADMIN)) { + rcu_read_unlock(); + request_module("%s", name); + rcu_read_lock(); + ulp = tcp_ulp_find(name); + } +#endif + if (!ulp || !try_module_get(ulp->owner)) + ulp = NULL; + + rcu_read_unlock(); + return ulp; +} + +/* Attach new upper layer protocol to the list + * of available protocols. + */ +int tcp_register_ulp(struct tcp_ulp_ops *ulp) +{ + int ret = 0; + + spin_lock(&tcp_ulp_list_lock); + if (tcp_ulp_find(ulp->name)) { + pr_notice("%s already registered or non-unique name\n", + ulp->name); + ret = -EEXIST; + } else { + list_add_tail_rcu(&ulp->list, &tcp_ulp_list); + } + spin_unlock(&tcp_ulp_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_ulp); + +void tcp_unregister_ulp(struct tcp_ulp_ops *ulp) +{ + spin_lock(&tcp_ulp_list_lock); + list_del_rcu(&ulp->list); + spin_unlock(&tcp_ulp_list_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(tcp_unregister_ulp); + +/* Build string with list of available upper layer protocl values */ +void tcp_get_available_ulp(char *buf, size_t maxlen) +{ + struct tcp_ulp_ops *ulp_ops; + size_t offs = 0; + + *buf = '\0'; + rcu_read_lock(); + list_for_each_entry_rcu(ulp_ops, &tcp_ulp_list, list) { + offs += snprintf(buf + offs, maxlen - offs, + "%s%s", + offs == 0 ? "" : " ", ulp_ops->name); + } + rcu_read_unlock(); +} + +void tcp_cleanup_ulp(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!icsk->icsk_ulp_ops) + return; + + if (icsk->icsk_ulp_ops->release) + icsk->icsk_ulp_ops->release(sk); + module_put(icsk->icsk_ulp_ops->owner); +} + +/* Change upper layer protocol for socket */ +int tcp_set_ulp(struct sock *sk, const char *name) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_ulp_ops *ulp_ops; + int err = 0; + + if (icsk->icsk_ulp_ops) + return -EEXIST; + + ulp_ops = __tcp_ulp_find_autoload(name); + if (!ulp_ops) + err = -ENOENT; + else + err = ulp_ops->init(sk); + + if (err) + goto out; + + icsk->icsk_ulp_ops = ulp_ops; + out: + return err; +} diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fdcb7437cc15..86fad2a14ac4 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1163,6 +1163,35 @@ out: return ret; } +#if BITS_PER_LONG == 64 +static void udp_set_dev_scratch(struct sk_buff *skb) +{ + struct udp_dev_scratch *scratch; + + BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); + scratch = (struct udp_dev_scratch *)&skb->dev_scratch; + scratch->truesize = skb->truesize; + scratch->len = skb->len; + scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); + scratch->is_linear = !skb_is_nonlinear(skb); +} + +static int udp_skb_truesize(struct sk_buff *skb) +{ + return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize; +} +#else +static void udp_set_dev_scratch(struct sk_buff *skb) +{ + skb->dev_scratch = skb->truesize; +} + +static int udp_skb_truesize(struct sk_buff *skb) +{ + return skb->dev_scratch; +} +#endif + /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int size, int partial, bool rx_queue_lock_held) @@ -1213,14 +1242,16 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, */ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) { - udp_rmem_release(sk, skb->dev_scratch, 1, false); + prefetch(&skb->data); + udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } EXPORT_SYMBOL(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) { - udp_rmem_release(sk, skb->dev_scratch, 1, true); + prefetch(&skb->data); + udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } /* Idea of busylocks is to let producers grab an extra spinlock @@ -1274,10 +1305,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) busy = busylock_acquire(sk); } size = skb->truesize; - /* Copy skb->truesize into skb->dev_scratch to avoid a cache line miss - * in udp_skb_destructor() - */ - skb->dev_scratch = size; + udp_set_dev_scratch(skb); /* we drop only if the receive buf is full and the receive * queue contains some other skb @@ -1359,7 +1387,8 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) sk_peek_offset_bwd(sk, len); unlock_sock_fast(sk, slow); } - consume_skb(skb); + + consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); @@ -1369,16 +1398,23 @@ static struct sk_buff *__first_packet_length(struct sock *sk, { struct sk_buff *skb; - while ((skb = skb_peek(rcvq)) != NULL && - udp_lib_checksum_complete(skb)) { - __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, - IS_UDPLITE(sk)); - __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); - __skb_unlink(skb, rcvq); - *total += skb->truesize; - kfree_skb(skb); + while ((skb = skb_peek(rcvq)) != NULL) { + if (udp_lib_checksum_complete(skb)) { + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + atomic_inc(&sk->sk_drops); + __skb_unlink(skb, rcvq); + *total += skb->truesize; + kfree_skb(skb); + } else { + /* the csum related bits could be changed, refresh + * the scratch area + */ + udp_set_dev_scratch(skb); + break; + } } return skb; } @@ -1540,7 +1576,7 @@ try_again: if (!skb) return err; - ulen = skb->len; + ulen = udp_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; @@ -1555,14 +1591,18 @@ try_again: if (copied < ulen || peeking || (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { - checksum_valid = !udp_lib_checksum_complete(skb); + checksum_valid = udp_skb_csum_unnecessary(skb) || + !__udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } - if (checksum_valid || skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, off, msg, copied); - else { + if (checksum_valid || udp_skb_csum_unnecessary(skb)) { + if (udp_skb_is_linear(skb)) + err = copy_linear_skb(skb, copied, off, &msg->msg_iter); + else + err = skb_copy_datagram_msg(skb, off, msg, copied); + } else { err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) @@ -1739,6 +1779,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } + /* clear all pending head states while they are hot in the cache */ + skb_release_head_state(skb); + rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); @@ -1853,6 +1896,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } + prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) goto csum_error; @@ -1881,9 +1925,10 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; - dst_hold(dst); - old = xchg(&sk->sk_rx_dst, dst); - dst_release(old); + if (dst_hold_safe(dst)) { + old = xchg(&sk->sk_rx_dst, dst); + dst_release(old); + } } /* @@ -2207,13 +2252,11 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { - /* DST_NOCACHE can not be used without taking a reference */ - if (dst->flags & DST_NOCACHE) { - if (likely(atomic_inc_not_zero(&dst->__refcnt))) - skb_dst_set(skb, dst); - } else { - skb_dst_set_noref(skb, dst); - } + /* set noref for now. + * any place which wants to hold dst has to call + * dst_hold_safe() + */ + skb_dst_set_noref(skb, dst); } } diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 71acd0014f2d..856d2dfdb44b 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -57,8 +57,7 @@ static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb) xfrm4_beet_make_header(skb); - ph = (struct ip_beet_phdr *) - __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen); + ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen); top_iph = ip_hdr(skb); |