Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/bpf_tcp_ca.c | 6
-rw-r--r-- | net/ipv4/cipso_ipv4.c | 2
-rw-r--r-- | net/ipv4/fib_semantics.c | 5
-rw-r--r-- | net/ipv4/fou_core.c | 2
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 58
-rw-r--r-- | net/ipv4/inet_fragment.c | 2
-rw-r--r-- | net/ipv4/inet_timewait_sock.c | 63
-rw-r--r-- | net/ipv4/ip_fragment.c | 2
-rw-r--r-- | net/ipv4/ip_output.c | 14
-rw-r--r-- | net/ipv4/ip_tunnel.c | 10
-rw-r--r-- | net/ipv4/metrics.c | 8
-rw-r--r-- | net/ipv4/ping.c | 2
-rw-r--r-- | net/ipv4/raw.c | 6
-rw-r--r-- | net/ipv4/route.c | 16
-rw-r--r-- | net/ipv4/syncookies.c | 2
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 80
-rw-r--r-- | net/ipv4/tcp.c | 112
-rw-r--r-- | net/ipv4/tcp_ao.c | 24
-rw-r--r-- | net/ipv4/tcp_cong.c | 11
-rw-r--r-- | net/ipv4/tcp_fastopen.c | 7
-rw-r--r-- | net/ipv4/tcp_input.c | 78
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 60
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 31
-rw-r--r-- | net/ipv4/tcp_output.c | 27
-rw-r--r-- | net/ipv4/tcp_sigpool.c | 17
-rw-r--r-- | net/ipv4/tcp_timer.c | 6
-rw-r--r-- | net/ipv4/udp.c | 15
-rw-r--r-- | net/ipv4/udp_offload.c | 8
28 files changed, 452 insertions, 222 deletions
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 18227757ec0c..3f88d0961e5b 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -260,17 +260,17 @@ static int bpf_tcp_ca_check_member(const struct btf_type *t, return 0; } -static int bpf_tcp_ca_reg(void *kdata) +static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link) { return tcp_register_congestion_control(kdata); } -static void bpf_tcp_ca_unreg(void *kdata) +static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link) { tcp_unregister_congestion_control(kdata); } -static int bpf_tcp_ca_update(void *kdata, void *old_kdata) +static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link) { return tcp_update_congestion_control(kdata, old_kdata); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index e9cb27061c12..8cc0e2f4159d 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1976,7 +1976,7 @@ int cipso_v4_req_setattr(struct request_sock *req, buf = NULL; req_inet = inet_rsk(req); - opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); + opt = unrcu_pointer(xchg(&req_inet->ireq_opt, RCU_INITIALIZER(opt))); if (opt) kfree_rcu(opt, rcu); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f669da98d11d..7b6b042208bd 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1030,7 +1030,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strscpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) return false; @@ -1459,8 +1459,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) goto failure; - fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx, - cfg->fc_mx_len, extack); + fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index a8494f796dca..0abbc413e0fe 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -433,7 +433,7 @@ next_proto: offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); - if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) + if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d4f0eff8b20f..64d07b842e73 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -911,6 +911,64 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); +static struct request_sock * +reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, + bool attach_listener) +{ + struct request_sock *req; + + req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN); + if (!req) + return NULL; + req->rsk_listener = NULL; + if (attach_listener) { + if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) { + kmem_cache_free(ops->slab, req); + return NULL; + } + req->rsk_listener = sk_listener; + } + req->rsk_ops = ops; + req_to_sk(req)->sk_prot = sk_listener->sk_prot; + sk_node_init(&req_to_sk(req)->sk_node); + sk_tx_queue_clear(req_to_sk(req)); + req->saved_syn = NULL; + req->syncookie = 0; + req->timeout = 0; + req->num_timeout = 0; + req->num_retrans = 0; + req->sk = NULL; + refcount_set(&req->rsk_refcnt, 0); + + return req; +} +#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__)) + +struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + struct request_sock *req = reqsk_alloc(ops, sk_listener, + attach_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + ireq->ireq_opt = NULL; +#if IS_ENABLED(CONFIG_IPV6) + ireq->pktopts = NULL; +#endif + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + ireq->ireq_family = sk_listener->sk_family; + req->timeout = TCP_TIMEOUT_INIT; + } + + return req; +} +EXPORT_SYMBOL(inet_reqsk_alloc); + static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index faaec92a46ac..d179a2c84222 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -619,7 +619,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; - head->mono_delivery_time = q->mono_delivery_time; + head->tstamp_type = q->tstamp_type; if (sk) refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index e28075f0006e..337390ba85b4 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -92,13 +92,22 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, hlist_nulls_add_head_rcu(&tw->tw_node, list); } +static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, false); +} + /* - * Enter the time wait state. This is called with locally disabled BH. + * Enter the time wait state. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. + * + * The caller must not access @tw anymore after this function returns. 
*/ -void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo) +void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, + struct sock *sk, + struct inet_hashinfo *hashinfo, + int timeo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); @@ -114,6 +123,7 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, hashinfo->bhash_size)]; bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num); + local_bh_disable(); spin_lock(&bhead->lock); spin_lock(&bhead2->lock); @@ -129,26 +139,34 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); + /* Step 2: Hash TW into tcp ehash chain */ inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock(lock); + /* Ensure above writes are committed into memory before updating the + * refcount. + * Provides ordering vs later refcount_inc(). + */ + smp_wmb(); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. * - one reference for timer. - * We can use atomic_set() because prior spin_lock()/spin_unlock() - * committed into memory all tw fields. * Also note that after this point, we lost our implicit reference * so we are not allowed to use tw anymore. */ refcount_set(&tw->tw_refcnt, 3); + + inet_twsk_schedule(tw, timeo); + + spin_unlock(lock); + local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance); +EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -192,7 +210,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); + timer_setup(&tw->tw_timer, tw_timer_handler, 0); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this @@ -217,7 +235,34 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { - if (del_timer_sync(&tw->tw_timer)) + struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; + spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + + /* inet_twsk_purge() walks over all sockets, including tw ones, + * and removes them via inet_twsk_deschedule_put() after a + * refcount_inc_not_zero(). + * + * inet_twsk_hashdance_schedule() must (re)init the refcount before + * arming the timer, i.e. inet_twsk_purge can obtain a reference to + * a twsk that did not yet schedule the timer. + * + * The ehash lock synchronizes these two: + * After acquiring the lock, the timer is always scheduled (else + * timer_shutdown returns false), because hashdance_schedule releases + * the ehash lock only after completing the timer initialization. + * + * Without grabbing the ehash lock, we get: + * 1) cpu x sets twsk refcount to 3 + * 2) cpu y bumps refcount to 4 + * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down + * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown + * -> timer refcount is never decremented. 
+ */ + spin_lock(lock); + /* Makes sure hashdance_schedule() has completed */ + spin_unlock(lock); + + if (timer_shutdown_sync(&tw->tw_timer)) inet_twsk_kill(tw); inet_twsk_put(tw); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 08e2c92e25ab..a92664a5ef2e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -355,7 +355,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; - qp->q.mono_delivery_time = skb->mono_delivery_time; + qp->q.tstamp_type = skb->tstamp_type; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9500031a1f55..b90d0f78ac80 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -764,7 +764,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; struct sk_buff *skb2; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; @@ -856,7 +856,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } } - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) @@ -912,7 +912,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; @@ -1457,7 +1457,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority); skb->mark = cork->mark; - skb->tstamp = cork->transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount @@ -1649,7 +1652,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - nskb->mono_delivery_time = !!transmit_time; + if (transmit_time) + nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index bccef2fcf620..5cffad42fe8c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1099,7 +1099,6 @@ static void ip_tunnel_dev_free(struct net_device *dev) gro_cells_destroy(&tunnel->gro_cells); dst_cache_destroy(&tunnel->dst_cache); - free_percpu(dev->tstats); } void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) @@ -1313,20 +1312,15 @@ int ip_tunnel_init(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = ip_tunnel_dev_free; - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (err) { - free_percpu(dev->tstats); + if (err) return err; - } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { dst_cache_destroy(&tunnel->dst_cache); - free_percpu(dev->tstats); return err; } diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 0e3ee1532848..8ddac1f595ed 100644 --- 
a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -7,7 +7,7 @@ #include <net/net_namespace.h> #include <net/tcp.h> -static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, +static int ip_metrics_convert(struct nlattr *fc_mx, int fc_mx_len, u32 *metrics, struct netlink_ext_ack *extack) { @@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, char tmp[TCP_CA_NAME_MAX]; nla_strscpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); return -EINVAL; @@ -63,7 +63,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, return 0; } -struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, +struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, struct netlink_ext_ack *extack) { @@ -77,7 +77,7 @@ struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, if (unlikely(!fib_metrics)) return ERR_PTR(-ENOMEM); - err = ip_metrics_convert(net, fc_mx, fc_mx_len, fib_metrics->metrics, + err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics, extack); if (!err) { refcount_set(&fib_metrics->refcnt, 1); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 823306487a82..619ddc087957 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -946,7 +946,7 @@ static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n", inet_sk(sk), inet_sk(sk)->inet_num, skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); pr_debug("ping_queue_rcv_skb -> failed\n"); return reason; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4cb43401e0e0..474dfd263c8b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -301,7 +301,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) ipv4_pktinfo_prepare(sk, skb, true); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } @@ -312,7 +312,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); @@ -360,7 +360,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->protocol = htons(ETH_P_IP); skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); *rtp = NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b3073d1c8f8f..54512acbead7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1481,7 +1481,6 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) struct uncached_list { spinlock_t lock; struct list_head head; - struct list_head quarantine; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); @@ -1532,7 +1531,7 @@ void rt_flush_dev(struct net_device *dev) rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); - list_move(&rt->dst.rt_uncached, &ul->quarantine); + list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } @@ -1924,7 +1923,7 @@ 
static u32 fib_multipath_custom_hash_outer(const struct net *net, hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, @@ -1973,7 +1972,7 @@ static u32 fib_multipath_custom_hash_inner(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, @@ -2010,7 +2009,7 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ @@ -2031,7 +2030,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ @@ -2065,7 +2064,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2096,7 +2095,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) @@ -3661,7 +3660,6 @@ int __init ip_rt_init(void) struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); - INIT_LIST_HEAD(&ul->quarantine); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index b61d36810fe3..1948d15f1f28 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -496,6 +496,6 @@ out: out_free: reqsk_free(req); out_drop: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); return NULL; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 162a0a3b6ba5..9140d20eb2d4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -130,7 +130,8 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write, return ret; } -static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) +static void inet_get_ping_group_range_table(const struct ctl_table *table, + kgid_t *low, kgid_t *high) { kgid_t *data = table->data; struct net *net = @@ -145,7 +146,8 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low } /* Update system visible IP port range */ -static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) +static void set_ping_group_range(const struct ctl_table *table, + kgid_t low, kgid_t high) { kgid_t *data = table->data; struct net *net = @@ -462,6 +464,61 @@ static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write, return ret; } + 
+static u32 proc_fib_multipath_hash_rand_seed __ro_after_init; + +static void proc_fib_multipath_hash_init_rand_seed(void) +{ + get_random_bytes(&proc_fib_multipath_hash_rand_seed, + sizeof(proc_fib_multipath_hash_rand_seed)); +} + +static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) +{ + struct sysctl_fib_multipath_hash_seed new = { + .user_seed = user_seed, + .mp_seed = (user_seed ? user_seed : + proc_fib_multipath_hash_rand_seed), + }; + + WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new); +} + +static int proc_fib_multipath_hash_seed(struct ctl_table *table, int write, + void *buffer, size_t *lenp, + loff_t *ppos) +{ + struct sysctl_fib_multipath_hash_seed *mphs; + struct net *net = table->data; + struct ctl_table tmp; + u32 user_seed; + int ret; + + mphs = &net->ipv4.sysctl_fib_multipath_hash_seed; + user_seed = mphs->user_seed; + + tmp = *table; + tmp.data = &user_seed; + + ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) { + proc_fib_multipath_hash_set_seed(net, user_seed); + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); + } + + return ret; +} +#else + +static void proc_fib_multipath_hash_init_rand_seed(void) +{ +} + +static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed) +{ +} + #endif static struct ctl_table ipv4_table[] = { @@ -1070,6 +1127,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &fib_multipath_hash_fields_all_mask, }, + { + .procname = "fib_multipath_hash_seed", + .data = &init_net, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_fib_multipath_hash_seed, + }, #endif { .procname = "ip_unprivileged_port_start", @@ -1501,6 +1565,14 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "tcp_rto_min_us", + .data = &init_net.ipv4.sysctl_tcp_rto_min_us, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) @@ -1540,6 +1612,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + proc_fib_multipath_hash_set_seed(net, 0); + return 0; err_ports: @@ -1574,6 +1648,8 @@ static __init int sysctl_ipv4_init(void) if (!hdr) return -ENOMEM; + proc_fib_multipath_hash_init_rand_seed(); + if (register_pernet_subsys(&ipv4_sysctl_ops)) { unregister_net_sysctl_table(hdr); return -ENOMEM; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6790ea74877..e03a342c9162 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -282,6 +282,7 @@ #include <asm/ioctls.h> #include <net/busy_poll.h> #include <net/hotdata.h> +#include <trace/events/tcp.h> #include <net/rps.h> /* Track pending CMSGs. 
*/ @@ -420,6 +421,7 @@ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int rto_min_us; tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; @@ -428,7 +430,8 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; - icsk->icsk_rto_min = TCP_RTO_MIN; + rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); + icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -598,7 +601,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) */ mask |= EPOLLOUT | EPOLLWRNORM; } - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */ smp_rmb(); if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) @@ -3086,7 +3089,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); + dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL))); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; @@ -4461,7 +4464,7 @@ int tcp_md5_hash_key(struct tcp_sigpool *hp, EXPORT_SYMBOL(tcp_md5_hash_key); /* Called with rcu_read_lock() */ -enum skb_drop_reason +static enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) @@ -4481,7 +4484,7 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, if (!key && hash_location) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); - tcp_hash_fail("Unexpected MD5 Hash found", family, skb, ""); + trace_tcp_hash_md5_unexpected(sk, skb); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } @@ -4496,29 +4499,90 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - if (family == AF_INET) { - tcp_hash_fail("MD5 Hash failed", AF_INET, skb, "%s L3 index %d", - genhash ? 
"tcp_v4_calc_md5_hash failed" - : "", l3index); - } else { - if (genhash) { - tcp_hash_fail("MD5 Hash failed", - AF_INET6, skb, "L3 index %d", - l3index); - } else { - tcp_hash_fail("MD5 Hash mismatch", - AF_INET6, skb, "L3 index %d", - l3index); - } - } + trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; } -EXPORT_SYMBOL(tcp_inbound_md5_hash); +#else +static inline enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int l3index, const __u8 *hash_location) +{ + return SKB_NOT_DROPPED_YET; +} #endif +/* Called with rcu_read_lock() */ +enum skb_drop_reason +tcp_inbound_hash(struct sock *sk, const struct request_sock *req, + const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct tcp_ao_hdr *aoh; + const __u8 *md5_location; + int l3index; + + /* Invalid option or two times meet any of auth options */ + if (tcp_parse_auth_options(th, &md5_location, &aoh)) { + trace_tcp_hash_bad_header(sk, skb); + return SKB_DROP_REASON_TCP_AUTH_HDR; + } + + if (req) { + if (tcp_rsk_used_ao(req) != !!aoh) { + u8 keyid, rnext, maclen; + + if (aoh) { + keyid = aoh->keyid; + rnext = aoh->rnext_keyid; + maclen = tcp_ao_hdr_maclen(aoh); + } else { + keyid = rnext = maclen = 0; + } + + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); + trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen); + return SKB_DROP_REASON_TCP_AOFAILURE; + } + } + + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + /* Fast path: unsigned segments */ + if (likely(!md5_location && !aoh)) { + /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid + * for the remote peer. On TCP-AO established connection + * the last key is impossible to remove, so there's + * always at least one current_key. 
+ */ + if (tcp_ao_required(sk, saddr, family, l3index, true)) { + trace_tcp_hash_ao_required(sk, skb); + return SKB_DROP_REASON_TCP_AONOTFOUND; + } + if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + trace_tcp_hash_md5_required(sk, skb); + return SKB_DROP_REASON_TCP_MD5NOTFOUND; + } + return SKB_NOT_DROPPED_YET; + } + + if (aoh) + return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh); + + return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, + l3index, md5_location); +} +EXPORT_SYMBOL_GPL(tcp_inbound_hash); + void tcp_done(struct sock *sk) { struct request_sock *req; @@ -4583,14 +4647,10 @@ int tcp_abort(struct sock *sk, int err) bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { - WRITE_ONCE(sk->sk_err, err); - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); - tcp_done(sk); + tcp_done_with_error(sk, err); } bh_unlock_sock(sk); diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 09c0fa6756b7..85531437890c 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -16,6 +16,7 @@ #include <net/tcp.h> #include <net/ipv6.h> #include <net/icmp.h> +#include <trace/events/tcp.h> DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ); @@ -884,17 +885,16 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, const struct tcp_ao_hdr *aoh, struct tcp_ao_key *key, u8 *traffic_key, u8 *phash, u32 sne, int l3index) { - u8 maclen = aoh->length - sizeof(struct tcp_ao_hdr); const struct tcphdr *th = tcp_hdr(skb); + u8 maclen = tcp_ao_hdr_maclen(aoh); void *hash_buf = NULL; if (maclen != tcp_ao_maclen(key)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); atomic64_inc(&info->counters.pkt_bad); atomic64_inc(&key->pkt_bad); - tcp_hash_fail("AO hash wrong length", family, skb, - "%u != %d L3index: %d", maclen, - tcp_ao_maclen(key), l3index); + trace_tcp_ao_wrong_maclen(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOFAILURE; } @@ -909,8 +909,8 @@ tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb, NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); atomic64_inc(&info->counters.pkt_bad); atomic64_inc(&key->pkt_bad); - tcp_hash_fail("AO hash mismatch", family, skb, - "L3index: %d", l3index); + trace_tcp_ao_mismatch(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); kfree(hash_buf); return SKB_DROP_REASON_TCP_AOFAILURE; } @@ -927,6 +927,7 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, int l3index, const struct tcp_ao_hdr *aoh) { const struct tcphdr *th = tcp_hdr(skb); + u8 maclen = tcp_ao_hdr_maclen(aoh); u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */ struct tcp_ao_info *info; enum skb_drop_reason ret; @@ -939,8 +940,8 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, info = rcu_dereference(tcp_sk(sk)->ao_info); if (!info) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND); - tcp_hash_fail("AO key not found", family, skb, - "keyid: %u L3index: %d", aoh->keyid, l3index); + trace_tcp_ao_key_not_found(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOUNEXPECTED; } @@ -981,6 +982,9 @@ tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb, current_key = READ_ONCE(info->current_key); /* Key rotation: the peer asks us to use new key (RNext) */ if (unlikely(aoh->rnext_keyid != current_key->sndid)) { + 
trace_tcp_ao_rnext_request(sk, skb, current_key->sndid, + aoh->rnext_keyid, + tcp_ao_hdr_maclen(aoh)); /* If the key is not found we do nothing. */ key = tcp_ao_established_key(info, aoh->rnext_keyid, -1); if (key) @@ -1046,8 +1050,8 @@ verify_hash: key_not_found: NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND); atomic64_inc(&info->counters.key_not_found); - tcp_hash_fail("Requested by the peer AO key id not found", - family, skb, "L3index: %d", l3index); + trace_tcp_ao_key_not_found(sk, skb, aoh->keyid, + aoh->rnext_keyid, maclen); return SKB_DROP_REASON_TCP_AOKEYNOTFOUND; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 28ffcfbeef14..48617d99abb0 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -46,8 +46,7 @@ void tcp_set_ca_state(struct sock *sk, const u8 ca_state) } /* Must be called with rcu lock held */ -static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, - const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); @@ -178,7 +177,7 @@ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_cong return ret; } -u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -186,7 +185,7 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = tcp_ca_find_autoload(net, name); + ca = tcp_ca_find_autoload(name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -283,7 +282,7 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) int ret; rcu_read_lock(); - ca = tcp_ca_find_autoload(net, name); + ca = tcp_ca_find_autoload(name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { @@ -421,7 +420,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, if (!load) ca = tcp_ca_find(name); else - ca = tcp_ca_find_autoload(sock_net(sk), name); + ca = tcp_ca_find_autoload(name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8ed54e7334a9..0f523cbfe329 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -49,7 +49,7 @@ void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; - ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL); + ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL)); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); @@ -80,9 +80,10 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; - octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx); + octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx))); } else { - octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx); + octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, + RCU_INITIALIZER(ctx))); } if (octx) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38da23f991d6..e0f54b9be850 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3604,8 +3604,10 @@ static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); - if (ao && ack < tp->snd_una) + if (ao && ack < tp->snd_una) { 
ao->snd_sne++; + trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne); + } #endif } @@ -3630,8 +3632,10 @@ static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) ao = rcu_dereference_protected(tp->ao_info, lockdep_sock_is_held((struct sock *)tp)); - if (ao && seq < tp->rcv_nxt) + if (ao && seq < tp->rcv_nxt) { ao->rcv_sne++; + trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne); + } #endif } @@ -4469,9 +4473,26 @@ static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, return SKB_NOT_DROPPED_YET; } + +void tcp_done_with_error(struct sock *sk, int err) +{ + /* This barrier is coupled with smp_rmb() in tcp_poll() */ + WRITE_ONCE(sk->sk_err, err); + smp_wmb(); + + tcp_write_queue_purge(sk); + tcp_done(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + sk_error_report(sk); +} +EXPORT_SYMBOL(tcp_done_with_error); + /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) { + int err; + trace_tcp_receive_reset(sk); /* mptcp can't tell us to ignore reset pkts, @@ -4483,24 +4504,17 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb) /* We want the right error as BSD sees it (and indeed as we do). */ switch (sk->sk_state) { case TCP_SYN_SENT: - WRITE_ONCE(sk->sk_err, ECONNREFUSED); + err = ECONNREFUSED; break; case TCP_CLOSE_WAIT: - WRITE_ONCE(sk->sk_err, EPIPE); + err = EPIPE; break; case TCP_CLOSE: return; default: - WRITE_ONCE(sk->sk_err, ECONNRESET); + err = ECONNRESET; } - /* This barrier is coupled with smp_rmb() in tcp_poll() */ - smp_wmb(); - - tcp_write_queue_purge(sk); - tcp_done(sk); - - if (!sock_flag(sk, SOCK_DEAD)) - sk_error_report(sk); + tcp_done_with_error(sk, err); } /* @@ -4836,10 +4850,7 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; - if (!mptcp_skb_can_collapse(to, from)) - return false; - - if (skb_cmp_decrypted(from, to)) + if (!tcp_skb_can_collapse_rx(to, from)) return false; if (!skb_try_coalesce(to, from, fragstolen, &delta)) @@ -4882,7 +4893,7 @@ static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { sk_drops_add(sk, skb); - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); } /* This one checks to see if we can put data from the @@ -5395,7 +5406,7 @@ restart: break; } - if (n && n != tail && mptcp_skb_can_collapse(skb, n) && + if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -5446,11 +5457,9 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || - !mptcp_skb_can_collapse(nskb, skb) || + !tcp_skb_can_collapse_rx(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; - if (skb_cmp_decrypted(skb, nskb)) - goto end; } } } @@ -7014,31 +7023,6 @@ static void tcp_openreq_init(struct request_sock *req, #endif } -struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk_listener, - bool attach_listener) -{ - struct request_sock *req = reqsk_alloc(ops, sk_listener, - attach_listener); - - if (req) { - struct inet_request_sock *ireq = inet_rsk(req); - - ireq->ireq_opt = NULL; -#if IS_ENABLED(CONFIG_IPV6) - ireq->pktopts = NULL; -#endif - atomic64_set(&ireq->ir_cookie, 0); - ireq->ireq_state = TCP_NEW_SYN_RECV; - write_pnet(&ireq->ireq_net, sock_net(sk_listener)); - ireq->ireq_family = sk_listener->sk_family; - req->timeout = TCP_TIMEOUT_INIT; - } - - return req; -} -EXPORT_SYMBOL(inet_reqsk_alloc); - /* * Return 
true if a syncookie should be sent */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b710958393e6..fd17f25ff288 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -93,7 +93,9 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk); +static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static u32 tcp_v4_init_seq(const struct sk_buff *skb) { @@ -114,6 +116,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) const struct inet_timewait_sock *tw = inet_twsk(sktw); const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); + int ts_recent_stamp; if (reuse == 2) { /* Still does not detect *everything* that goes through @@ -152,10 +155,11 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) If TW bucket has been already destroyed we fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tcptw->tw_ts_recent_stamp && + ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + if (ts_recent_stamp && (!twp || (reuse && time_after32(ktime_get_seconds(), - tcptw->tw_ts_recent_stamp)))) { - /* inet_twsk_hashdance() sets sk_refcnt after putting twsk + ts_recent_stamp)))) { + /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk * and releasing the bucket lock. */ if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt))) @@ -178,8 +182,8 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); + tp->rx_opt.ts_recent_stamp = ts_recent_stamp; } return 1; @@ -611,15 +615,10 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); - if (!sock_owned_by_user(sk)) { - WRITE_ONCE(sk->sk_err, err); - - sk_error_report(sk); - - tcp_done(sk); - } else { + if (!sock_owned_by_user(sk)) + tcp_done_with_error(sk, err); + else WRITE_ONCE(sk->sk_err_soft, err); - } goto out; } @@ -885,7 +884,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); + sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? @@ -910,6 +911,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG @@ -1005,7 +1007,8 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ctl_sk = this_cpu_read(ipv4_tcp_sk); + local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); + ctl_sk = this_cpu_read(ipv4_tcp_sk.sock); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? 
inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); @@ -1020,6 +1023,7 @@ static void tcp_v4_send_ack(const struct sock *sk, sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock); local_bh_enable(); } @@ -1057,19 +1061,17 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) #else if (0) { #endif -#ifdef CONFIG_TCP_MD5SIG - } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + } else if (static_branch_tcp_md5()) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; -#endif } tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), - tcptw->tw_ts_recent, + READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, tw->tw_tos, @@ -1131,8 +1133,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, #else if (0) { #endif -#ifdef CONFIG_TCP_MD5SIG - } else if (static_branch_unlikely(&tcp_md5_needed.key)) { + } else if (static_branch_tcp_md5()) { const union tcp_md5_addr *addr; int l3index; @@ -1141,7 +1142,6 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key.md5_key) key.type = TCP_KEY_MD5; -#endif } tcp_v4_send_ack(sk, skb, seq, @@ -1939,7 +1939,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) reset: tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); discard: - kfree_skb_reason(skb, reason); + sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and * gcc suffers from register pressure on the x86, sk (in %ebx) * might be destroyed here. This current version compiles correctly, @@ -2049,8 +2049,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || - !mptcp_skb_can_collapse(tail, skb) || - skb_cmp_decrypted(tail, skb) || + !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) goto no_coalesce; @@ -2176,8 +2175,8 @@ int tcp_v4_rcv(struct sk_buff *skb) int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; + struct sock *sk = NULL; bool refcounted; - struct sock *sk; int ret; u32 isn; @@ -2376,7 +2375,7 @@ bad_packet: discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); /* Discard frame. 
*/ - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; discard_and_relse: @@ -3506,6 +3505,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_shrink_window = 0; net->ipv4.sysctl_tcp_pingpong_thresh = 1; + net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); return 0; } @@ -3620,7 +3620,9 @@ void __init tcp_v4_init(void) */ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; - per_cpu(ipv4_tcp_sk, cpu) = sk; + sk->sk_clockid = CLOCK_MONOTONIC; + + per_cpu(ipv4_tcp_sk.sock, cpu) = sk; } if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 538c06f95918..bc67f6b9efae 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -101,16 +101,18 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + int ts_recent_stamp; tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { + ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; - tmp_opt.ts_recent = tcptw->tw_ts_recent; - tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent); + tmp_opt.ts_recent_stamp = ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -152,8 +154,10 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq); if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent_stamp = ktime_get_seconds(); - tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + WRITE_ONCE(tcptw->tw_ts_recent_stamp, + ktime_get_seconds()); + WRITE_ONCE(tcptw->tw_ts_recent, + tmp_opt.rcv_tsval); } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); @@ -197,8 +201,10 @@ kill: } if (tmp_opt.saw_tstamp) { - tcptw->tw_ts_recent = tmp_opt.rcv_tsval; - tcptw->tw_ts_recent_stamp = ktime_get_seconds(); + WRITE_ONCE(tcptw->tw_ts_recent, + tmp_opt.rcv_tsval); + WRITE_ONCE(tcptw->tw_ts_recent_stamp, + ktime_get_seconds()); } inet_twsk_put(tw); @@ -225,7 +231,7 @@ kill: if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || (tmp_opt.saw_tstamp && - (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; @@ -339,17 +345,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; - /* tw_timer is pinned, so we need to make sure BH are disabled - * in following section, otherwise timer handler could run before - * we complete the initialization. - */ - local_bh_disable(); - inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo); - local_bh_enable(); + inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. 
We've got bigger problems than diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95618d0e78e4..16c48df8df4c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1301,7 +1301,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; @@ -1655,7 +1655,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - skb_set_delivery_time(buff, skb->tstamp, true); + skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -2764,7 +2764,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ @@ -3752,11 +3752,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), - true); + SKB_CLOCK_MONOTONIC); else #endif { - skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3768,6 +3768,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_TCP_AO struct tcp_ao_key *ao_key = NULL; u8 keyid = tcp_rsk(req)->ao_keyid; + u8 rnext = tcp_rsk(req)->ao_rcv_next; ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req), keyid, -1); @@ -3777,6 +3778,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here. 
*/ if (unlikely(!ao_key)) { + trace_tcp_ao_synack_no_key(sk, keyid, rnext); rcu_read_unlock(); kfree_skb(skb); net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n", @@ -3843,7 +3845,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); - skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; @@ -4027,7 +4029,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); + skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) @@ -4163,16 +4165,9 @@ EXPORT_SYMBOL(tcp_connect); u32 tcp_delack_max(const struct sock *sk) { - const struct dst_entry *dst = __sk_dst_get(sk); - u32 delack_max = inet_csk(sk)->icsk_delack_max; - - if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) { - u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); - u32 delack_from_rto_min = max_t(int, 1, rto_min - 1); + u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; - delack_max = min_t(u32, delack_max, delack_from_rto_min); - } - return delack_max; + return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c index 8512cb09ebc0..d8a4f192873a 100644 --- a/net/ipv4/tcp_sigpool.c +++ b/net/ipv4/tcp_sigpool.c @@ -10,7 +10,14 @@ #include <net/tcp.h> static size_t __scratch_size; -static DEFINE_PER_CPU(void __rcu *, sigpool_scratch); +struct sigpool_scratch { + local_lock_t bh_lock; + void __rcu *pad; +}; + +static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; struct sigpool_entry { struct crypto_ahash *hash; @@ -72,7 +79,7 @@ static int sigpool_reserve_scratch(size_t size) break; } - old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), + old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), scratch, lockdep_is_held(&cpool_mutex)); if (!cpu_online(cpu) || !old_scratch) { kfree(old_scratch); @@ -93,7 +100,7 @@ static void sigpool_scratch_free(void) int cpu; for_each_possible_cpu(cpu) - kfree(rcu_replace_pointer(per_cpu(sigpool_scratch, cpu), + kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), NULL, lockdep_is_held(&cpool_mutex))); __scratch_size = 0; } @@ -277,7 +284,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RC /* Pairs with tcp_sigpool_reserve_scratch(), scratch area is * valid (allocated) until tcp_sigpool_end(). 
*/ - c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch)); + local_lock_nested_bh(&sigpool_scratch.bh_lock); + c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad)); return 0; } EXPORT_SYMBOL_GPL(tcp_sigpool_start); @@ -286,6 +294,7 @@ void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH) { struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req); + local_unlock_nested_bh(&sigpool_scratch.bh_lock); rcu_read_unlock_bh(); ahash_request_free(c->req); crypto_free_ahash(hash); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 892c86657fbc..4d40615dc8fc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -74,11 +74,7 @@ u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when) static void tcp_write_err(struct sock *sk) { - WRITE_ONCE(sk->sk_err, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); - sk_error_report(sk); - - tcp_write_queue_purge(sk); - tcp_done(sk); + tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 578668878a85..49c622e743e8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -940,8 +940,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, kfree_skb(skb); return -EINVAL; } - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || - dst_xfrm(skb_dst(skb))) { + if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } @@ -2076,7 +2075,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); trace_udp_fail_queue_rcv_skb(rc, sk, skb); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2198,7 +2197,7 @@ csum_error: drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2232,7 +2231,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old; if (dst_hold_safe(dst)) { - old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst); + old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst))); dst_release(old); return old != dst; } @@ -2385,7 +2384,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk; + struct sock *sk = NULL; struct udphdr *uh; unsigned short ulen; struct rtable *rt = skb_rtable(skb); @@ -2462,7 +2461,7 @@ no_sk: * Hmm. We got an UDP packet to a port to which we * don't wanna listen. Ignore it. */ - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; short_packet: @@ -2487,7 +2486,7 @@ csum_error: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); - kfree_skb_reason(skb, drop_reason); + sk_skb_reason_drop(sk, skb, drop_reason); return 0; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 59448a2dbf2c..aa2e0a28ca61 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -357,6 +357,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, else uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0; + /* On the TX path, CHECKSUM_NONE and CHECKSUM_UNNECESSARY have the same + * meaning. 
However, check for bad offloads in the GSO stack expects the + * latter, if the checksum was calculated in software. To vouch for the + * segment skbs we actually need to set it on the gso_skb. + */ + if (gso_skb->ip_summed == CHECKSUM_NONE) + gso_skb->ip_summed = CHECKSUM_UNNECESSARY; + /* update refcount for the packet */ if (copy_dtor) { int delta = sum_truesize - gso_skb->truesize; |