diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Kconfig | 10 | ||||
-rw-r--r-- | net/ipv4/af_inet.c | 36 | ||||
-rw-r--r-- | net/ipv4/datagram.c | 4 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 4 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 10 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 6 | ||||
-rw-r--r-- | net/ipv4/igmp.c | 6 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 2 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 96 | ||||
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 15 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_rpfilter.c | 3 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_fib_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/nexthop.c | 2 | ||||
-rw-r--r-- | net/ipv4/ping.c | 23 | ||||
-rw-r--r-- | net/ipv4/route.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 16 | ||||
-rw-r--r-- | net/ipv4/tcp_bpf.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp_cdg.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 29 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp_ulp.c | 3 | ||||
-rw-r--r-- | net/ipv4/udp.c | 14 | ||||
-rw-r--r-- | net/ipv4/udp_bpf.c | 4 | ||||
-rw-r--r-- | net/ipv4/udplite.c | 8 |
27 files changed, 212 insertions, 120 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e983bb0c5012..2dfb12230f08 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -402,6 +402,16 @@ config INET_IPCOMP If unsure, say Y. +config INET_TABLE_PERTURB_ORDER + int "INET: Source port perturbation table size (as power of 2)" if EXPERT + default 16 + help + Source port perturbation table size (as power of 2) for + RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm. + + The default is almost always what you want. + Only change this if you know what you are doing. + config INET_XFRM_TUNNEL tristate select INET_TUNNEL diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e2c219382345..0da679411330 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -558,22 +558,27 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (uaddr->sa_family == AF_UNSPEC) - return sk->sk_prot->disconnect(sk, flags); + return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->connect(sk, uaddr, addr_len); + return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); @@ -734,10 +739,11 @@ EXPORT_SYMBOL(inet_stream_connect); int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { - struct sock *sk1 = sock->sk; + struct sock *sk1 = sock->sk, *sk2; int err = -EINVAL; - struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); if (!sk2) goto do_err; @@ -748,6 +754,8 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT | TCPF_CLOSE))); + if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) + set_bit(SOCK_SUPPORT_ZC, &newsock->flags); sock_graft(sk2, newsock); newsock->state = SS_CONNECTED; @@ -825,12 +833,15 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; - if (sk->sk_prot->sendpage) - return sk->sk_prot->sendpage(sk, page, offset, size, flags); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (prot->sendpage) + return prot->sendpage(sk, page, offset, size, flags); return sock_no_sendpage(sock, page, offset, size, flags); } EXPORT_SYMBOL(inet_sendpage); @@ -1219,7 +1230,6 @@ EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { - struct inet_bind_hashbucket *prev_addr_hashbucket; struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; @@ -1249,16 +1259,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) return 0; } - prev_addr_hashbucket = - inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk, - sock_net(sk), inet->inet_num); - - inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; - - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { - inet->inet_saddr = old_saddr; - inet->inet_rcv_saddr = old_saddr; ip_rt_put(rt); return err; } diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 405a8c2aea64..4d1af0cd7d99 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -70,10 +70,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len } inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; - reuseport_has_conns(sk, true); + reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 943edf4ad4db..f361d3d56be2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) @@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2dc97583d279..f721c308248b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -888,13 +888,13 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, return 1; } + /* cannot match on nexthop object attributes */ + if (fi->nh) + return 1; + if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; - /* cannot match on nexthop object attributes */ - if (fi->nh) - return 1; - nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, @@ -1231,7 +1231,7 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, nh->fib_nh_dev = in_dev->dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); - nh->fib_nh_scope = RT_SCOPE_LINK; + nh->fib_nh_scope = RT_SCOPE_HOST; if (!netif_carrier_ok(nh->fib_nh_dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 452ff177e4da..74d403dbd2b4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -126,7 +126,7 @@ struct key_vector { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ - struct key_vector __rcu *tnode[0]; + DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; @@ -1381,8 +1381,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, /* The alias was already inserted, so the node must exist. */ l = l ? l : fib_find_node(t, &tp, key); - if (WARN_ON_ONCE(!l)) + if (WARN_ON_ONCE(!l)) { + err = -ENOENT; goto out_free_new_fa; + } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index df0660d818ac..81be3e0f0e70 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -213,7 +213,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = prandom_u32() % max_delay; + int tv = prandom_u32_max(max_delay); im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -222,7 +222,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = prandom_u32() % in_dev->mr_maxdelay; + int tv = prandom_u32_max(in_dev->mr_maxdelay); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && @@ -236,7 +236,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { - int tv = prandom_u32() % delay; + int tv = prandom_u32_max(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ebca860e113f..4e84ed21d16f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -314,7 +314,7 @@ other_half_scan: if (likely(remaining > 1)) remaining &= ~1U; - offset = prandom_u32() % remaining; + offset = prandom_u32_max(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index a0ad34e4f044..3cec471a2cd2 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -858,34 +858,80 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) +static void inet_update_saddr(struct sock *sk, void *saddr, int family) +{ + if (family == AF_INET) { + inet_sk(sk)->inet_saddr = *(__be32 *)saddr; + sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else { + sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; + } +#endif +} + +static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); - struct inet_bind_hashbucket *head2; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + int bhash; + + if (!inet_csk(sk)->icsk_bind2_hash) { + /* Not bind()ed before. */ + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); + + return 0; + } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); - if (!new_tb2) + if (!new_tb2) { + if (reset) { + /* The (INADDR_ANY, port) bucket might have already + * been freed, then we cannot fixup icsk_bind2_hash, + * so we give up and unlink sk from bhash/bhash2 not + * to leave inconsistency in bhash2. + */ + inet_put_port(sk); + inet_reset_saddr(sk); + } + return -ENOMEM; + } + bhash = inet_bhashfn(net, port, hinfo->bhash_size); + head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - if (prev_saddr) { - spin_lock_bh(&prev_saddr->lock); - __sk_del_bind2_node(sk); - inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, - inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); - } + /* If we change saddr locklessly, another thread + * iterating over bhash might see corrupted address. + */ + spin_lock_bh(&head->lock); - spin_lock_bh(&head2->lock); + spin_lock(&head2->lock); + __sk_del_bind2_node(sk); + inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); + spin_unlock(&head2->lock); + + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); + + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + + spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; @@ -893,26 +939,40 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; - spin_unlock_bh(&head2->lock); + spin_unlock(&head2->lock); + + spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); return 0; } + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +{ + return __inet_bhash2_update_saddr(sk, saddr, family, false); +} EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +void inet_bhash2_reset_saddr(struct sock *sk) +{ + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + __inet_bhash2_update_saddr(sk, NULL, 0, true); +} +EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); + /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. + * * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though - * attacks were since demonstrated, thus we use 65536 instead to really - * give more isolation and privacy, at the expense of 256kB of kernel - * memory. + * attacks were since demonstrated, thus we use 65536 by default instead + * to really give more isolation and privacy, at the expense of 256kB + * of kernel memory. */ -#define INET_TABLE_PERTURB_SHIFT 16 -#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) +#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER) static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, @@ -1037,7 +1097,7 @@ ok: * on low contention the randomness is maximal and on high contention * it may be inexistent. */ - i = max_t(int, i, (prandom_u32() & 7) * 2); + i = max_t(int, i, prandom_u32_max(8) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 71d3bb0abf6c..66fc940f9521 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -268,8 +268,21 @@ restart_rcu: rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (sk->sk_state != TCP_TIME_WAIT) + if (sk->sk_state != TCP_TIME_WAIT) { + /* A kernel listener socket might not hold refcnt for net, + * so reqsk_timer_handler() could be fired after net is + * freed. Userspace listener and reqsk never exist here. + */ + if (unlikely(sk->sk_state == TCP_NEW_SYN_RECV && + hashinfo->pernet)) { + struct request_sock *req = inet_reqsk(sk); + + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); + } + continue; + } + tw = inet_twsk(sk); if ((tw->tw_family != family) || refcount_read(&twsk_net(tw)->ns.count)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 1ae83ad629b2..922c87ef1ab5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -172,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, * Avoid using the hashed IP ident generator. */ if (sk->sk_protocol == IPPROTO_TCP) - iph->id = (__force __be16)prandom_u32(); + iph->id = (__force __be16)get_random_u16(); else __ip_select_ident(net, iph, 1); } diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8e176c77d1c..b3cc416ed292 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -435,7 +435,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (ctinfo) { case IP_CT_NEW: - ct->mark = hash; + WRITE_ONCE(ct->mark, hash); break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: @@ -452,7 +452,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) #ifdef DEBUG nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - pr_debug("hash=%u ct_hash=%u ", hash, ct->mark); + pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); if (!clusterip_responsible(cipinfo->config, hash)) { pr_debug("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8183bbcabb4a..ded5bef02f77 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -77,7 +77,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; - flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); + flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); + flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 7ade04ff972d..fc65d69f23e1 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -65,6 +65,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), }; const struct net_device *oif; const struct net_device *found; @@ -84,7 +85,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, oif = NULL; if (priv->flags & NFTA_FIB_F_IIF) - fl4.flowi4_oif = l3mdev_master_ifindex_rcu(oif); + fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(oif); if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 853a75a8fbaf..d8ef05347fd9 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2534,7 +2534,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, - fib_nh->fib_nh_scope); + !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 517042caf6dc..bde333b24837 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -617,21 +617,9 @@ int ping_getfrag(void *from, char *to, { struct pingfakehdr *pfh = from; - if (offset == 0) { - fraglen -= sizeof(struct icmphdr); - if (fraglen < 0) - BUG(); - if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr), - fraglen, &pfh->wcheck, - &pfh->msg->msg_iter)) - return -EFAULT; - } else if (offset < sizeof(struct icmphdr)) { - BUG(); - } else { - if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, - &pfh->msg->msg_iter)) - return -EFAULT; - } + if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, + &pfh->msg->msg_iter)) + return -EFAULT; #if IS_ENABLED(CONFIG_IPV6) /* For IPv6, checksum each skb as we go along, as expected by @@ -639,7 +627,7 @@ int ping_getfrag(void *from, char *to, * wcheck, it will be finalized in ping_v4_push_pending_frames. */ if (pfh->family == AF_INET6) { - skb->csum = pfh->wcheck; + skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd); skb->ip_summed = CHECKSUM_NONE; pfh->wcheck = 0; } @@ -842,7 +830,8 @@ back_from_confirm: pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, - 0, &ipc, &rt, msg->msg_flags); + sizeof(struct icmphdr), &ipc, &rt, + msg->msg_flags); if (err) ip_flush_pending_frames(sk); else diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 795cbe1de912..cd1fa9f70f1a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3664,7 +3664,7 @@ static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); - atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); + atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } @@ -3719,7 +3719,7 @@ int __init ip_rt_init(void) ip_idents = idents_hash; - prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); + get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c51abeee172..4f2205756cfe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); @@ -3113,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); @@ -3646,7 +3646,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; - else if (sk->sk_state == TCP_ESTABLISHED) + else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; @@ -3796,8 +3796,9 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, + optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); @@ -4396,8 +4397,9 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, + optval, optlen); return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a1626afe87a1..cf9c3e8f7ccb 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -278,7 +278,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, { bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; - u32 tosend, delta = 0; + u32 tosend, origsize, sent, delta = 0; u32 eval = __SK_NONE; int ret; @@ -333,10 +333,12 @@ more_data: cork = true; psock->cork = NULL; } - sk_msg_return(sk, msg, msg->sg.size); + sk_msg_return(sk, msg, tosend); release_sock(sk); + origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); @@ -375,7 +377,7 @@ more_data: msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) { if (eval == __SK_REDIRECT) - sk_mem_charge(sk, msg->sg.size); + sk_mem_charge(sk, tosend - sent); goto more_data; } } @@ -607,7 +609,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) } else { sk->sk_write_space = psock->saved_write_space; /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); } return 0; } @@ -620,7 +622,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) } /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + sock_replace_proto(sk, &tcp_bpf_prots[family][config]); return 0; } EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index ddc7ba0554bd..ba4d98e510e0 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -243,7 +243,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + if (get_random_u32() <= nexp_u32(grad * backoff_factor)) return false; if (use_ineff) { @@ -375,6 +375,7 @@ static void tcp_cdg_init(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->gradients = NULL; /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), @@ -388,6 +389,7 @@ static void tcp_cdg_release(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); kfree(ca->gradients); + ca->gradients = NULL; } static struct tcp_congestion_ops tcp_cdg __read_mostly = { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bc2ea12221f9..0640453fce54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2192,7 +2192,8 @@ void tcp_enter_loss(struct sock *sk) */ static bool tcp_check_sack_reneging(struct sock *sk, int flag) { - if (flag & FLAG_SACK_RENEGING) { + if (flag & FLAG_SACK_RENEGING && + flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6376ad915765..da46357f501b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -199,15 +199,14 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct ip_options_rcu *inet_opt; struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -251,24 +250,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!inet->inet_saddr) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { @@ -323,7 +311,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr); } - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -343,6 +331,7 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); + inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; @@ -1543,7 +1532,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = prandom_u32(); + newinet->inet_id = get_random_u16(); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). @@ -1874,11 +1863,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, __skb_push(skb, hdrlen); no_coalesce: + limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); + /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. * Few sockets backlog are possibly concurrently non empty. */ - limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; + limit += 64 * 1024; if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 79f30f026d89..c375f603a16c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -353,13 +353,14 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { - /* The last refcount is decremented in tcp_sk_exit_batch() */ - if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1) - continue; - if (net->ipv4.tcp_death_row.hashinfo->pernet) { + /* Even if tw_refcount == 1, we must clean up kernel reqsk */ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); } else if (!purged_once) { + /* The last refcount is decremented in tcp_sk_exit_batch() */ + if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1) + continue; + inet_twsk_purge(&tcp_hashinfo, family); purged_once = true; } diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 7c27aa629af1..9ae50b1bd844 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -136,6 +136,9 @@ static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops) if (icsk->icsk_ulp_ops) goto out_err; + if (sk->sk_socket) + clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + err = ulp_ops->init(sk); if (err) goto out_err; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d63118ce5900..6a320a614e54 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = prandom_u32(); + rand = get_random_u32(); first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE @@ -448,7 +448,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk, false)) + if (result && !reuseport_has_conns(sk)) return result; result = result ? : sk; @@ -1598,7 +1598,7 @@ drop: } EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); -void udp_destruct_sock(struct sock *sk) +void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); @@ -1611,18 +1611,22 @@ void udp_destruct_sock(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); +} +EXPORT_SYMBOL_GPL(udp_destruct_common); +static void udp_destruct_sock(struct sock *sk) +{ + udp_destruct_common(sk); inet_sock_destruct(sk); } -EXPORT_SYMBOL_GPL(udp_destruct_sock); int udp_init_sock(struct sock *sk) { skb_queue_head_init(&udp_sk(sk)->reader_queue); sk->sk_destruct = udp_destruct_sock; + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } -EXPORT_SYMBOL_GPL(udp_init_sock); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index ff15918b7bdc..e5dc91d0e079 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -141,14 +141,14 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) if (restore) { sk->sk_write_space = psock->saved_write_space; - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); return 0; } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + sock_replace_proto(sk, &udp_bpf_prots[family]); return 0; } EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 6e08a76ae1e7..e0c9cc39b81e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -17,6 +17,14 @@ struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); +/* Designate sk as UDP-Lite socket */ +static int udplite_sk_init(struct sock *sk) +{ + udp_init_sock(sk); + udp_sk(sk)->pcflag = UDPLITE_BIT; + return 0; +} + static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); |