From 254d900b801fc04aa524ff7bafe28fdd1dbf0ed6 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 14 Jul 2017 12:04:16 +0300 Subject: ipv4: ip_do_fragment: fix headroom tests Some time ago David Woodhouse reported skb_under_panic when we try to push ethernet header to fragmented ipv6 skbs. It was fixed for ipv6 by Florian Westphal in commit 1d325d217c7f ("ipv6: ip6_fragment: fix headroom tests and skb leak") However similar problem still exist in ipv4. It does not trigger skb_under_panic due paranoid check in ip_finish_output2, however according to Alexey Kuznetsov current state is abnormal and ip_fragment should be fixed too. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7eb252dcecee..50c74cd890bc 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -599,6 +599,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; + ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing @@ -614,14 +615,15 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (first_len - hlen > mtu || ((first_len - hlen) & 7) || ip_is_fragment(iph) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < hlen + ll_rs) goto slow_path_clean; /* Partially cloned skb? */ @@ -711,8 +713,6 @@ slow_path: left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ - ll_rs = LL_RESERVED_SPACE(rt->dst.dev); - /* * Fragment the datagram. */ -- cgit From 4aea287e90dd61a48268ff2994b56f9799441b62 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:21 -0400 Subject: tcp_bbr: cut pacing rate only if filled pipe In bbr_set_pacing_rate(), which decides whether to cut the pacing rate, there was some code that considered exiting STARTUP to be equivalent to the notion of filling the pipe (i.e., bbr_full_bw_reached()). Specifically, as the code was structured, exiting STARTUP and going into PROBE_RTT could cause us to cut the pacing rate down to something silly and low, based on whatever bandwidth samples we've had so far, when it's possible that all of them have been small app-limited bandwidth samples that are not representative of the bandwidth available in the path. (The code was correct at the time it was written, but the state machine changed without this spot being adjusted correspondingly.) Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index dbcc9352a48f..743e97511dc8 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -220,12 +220,11 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { - struct bbr *bbr = inet_csk_ca(sk); u64 rate = bw; rate = bbr_rate_bytes_per_sec(sk, rate, gain); rate = min_t(u64, rate, sk->sk_max_pacing_rate); - if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) sk->sk_pacing_rate = rate; } -- cgit From f19fd62dafaf1ed6cf615dba655b82fa9df59074 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:22 -0400 Subject: tcp_bbr: introduce bbr_bw_to_pacing_rate() helper Introduce a helper to convert a BBR bandwidth and gain factor to a pacing rate in bytes per second. This is a pure refactor, but is needed for two following fixes. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 743e97511dc8..29e23b851b97 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -211,6 +211,16 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) return rate >> BW_SCALE; } +/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; +} + /* Pace using current bw estimate and a gain factor. In order to help drive the * network toward lower queues while maintaining high utilization and low * latency, the average pacing rate aims to be slightly (~1%) lower than the @@ -220,10 +230,8 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { - u64 rate = bw; + u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); - rate = bbr_rate_bytes_per_sec(sk, rate, gain); - rate = min_t(u64, rate, sk->sk_max_pacing_rate); if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) sk->sk_pacing_rate = rate; } -- cgit From 79135b89b8af304456bd67916b80116ddf03d7b6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:23 -0400 Subject: tcp_bbr: introduce bbr_init_pacing_rate_from_rtt() helper Introduce a helper to initialize the BBR pacing rate unconditionally, based on the current cwnd and RTT estimate. This is a pure refactor, but is needed for two following fixes. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 29e23b851b97..3276140c2506 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -221,6 +221,23 @@ static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) return rate; } +/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ +static void bbr_init_pacing_rate_from_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u64 bw; + u32 rtt_us; + + if (tp->srtt_us) { /* any RTT sample yet? */ + rtt_us = max(tp->srtt_us >> 3, 1U); + } else { /* no RTT sample yet */ + rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ + } + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, rtt_us); + sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); +} + /* Pace using current bw estimate and a gain factor. In order to help drive the * network toward lower queues while maintaining high utilization and low * latency, the average pacing rate aims to be slightly (~1%) lower than the @@ -805,7 +822,6 @@ static void bbr_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u64 bw; bbr->prior_cwnd = 0; bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ @@ -821,11 +837,8 @@ static void bbr_init(struct sock *sk) minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ - /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ - bw = (u64)tp->snd_cwnd * BW_UNIT; - do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ - bbr_set_pacing_rate(sk, bw, bbr_high_gain); + bbr_init_pacing_rate_from_rtt(sk); bbr->restore_cwnd = 0; bbr->round_start = 0; -- cgit From 1d3648eb5d1fe9ed3d095ed8fa19ad11ca4c8bc0 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:24 -0400 Subject: tcp_bbr: remove sk_pacing_rate=0 transient during init Fix a corner case noticed by Eric Dumazet, where BBR's setting sk->sk_pacing_rate to 0 during initialization could theoretically cause packets in the sending host to hang if there were packets "in flight" in the pacing infrastructure at the time the BBR congestion control state is initialized. This could occur if the pacing infrastructure happened to race with bbr_init() in a way such that the pacer read the 0 rather than the immediately following non-zero pacing rate. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Reported-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 3276140c2506..42e0017f2ebc 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -837,7 +837,6 @@ static void bbr_init(struct sock *sk) minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ - sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ bbr_init_pacing_rate_from_rtt(sk); bbr->restore_cwnd = 0; -- cgit From 32984565574da7ed3afa10647bb4020d7a9e6c93 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:25 -0400 Subject: tcp_bbr: init pacing rate on first RTT sample Fixes the following behavior: for connections that had no RTT sample at the time of initializing congestion control, BBR was initializing the pacing rate to a high nominal rate (based an a guess of RTT=1ms, in case this is LAN traffic). Then BBR never adjusted the pacing rate downward upon obtaining an actual RTT sample, if the connection never filled the pipe (e.g. all sends were small app-limited writes()). This fix adjusts the pacing rate upon obtaining the first RTT sample. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 42e0017f2ebc..69ee877574d0 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -112,7 +112,8 @@ struct bbr { cwnd_gain:10, /* current gain for setting cwnd */ full_bw_cnt:3, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ - unused_b:6; + has_seen_rtt:1, /* have we seen an RTT sample yet? */ + unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ }; @@ -225,11 +226,13 @@ static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) static void bbr_init_pacing_rate_from_rtt(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); u64 bw; u32 rtt_us; if (tp->srtt_us) { /* any RTT sample yet? */ rtt_us = max(tp->srtt_us >> 3, 1U); + bbr->has_seen_rtt = 1; } else { /* no RTT sample yet */ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ } @@ -247,8 +250,12 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); + if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) + bbr_init_pacing_rate_from_rtt(sk); if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) sk->sk_pacing_rate = rate; } @@ -837,6 +844,7 @@ static void bbr_init(struct sock *sk) minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + bbr->has_seen_rtt = 0; bbr_init_pacing_rate_from_rtt(sk); bbr->restore_cwnd = 0; -- cgit From 974292defee033bc43ccfcb2fcefc3eba3905340 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jul 2017 13:29:03 +0200 Subject: netfilter: nf_tables: only allow in/output for arp packets arp packets cannot be forwarded. They can be bridged, but then they can be filtered using either ebtables or nftables bridge family. The bridge netfilter exposes a "call-arptables" switch which pushes packets into arptables, but lets not expose this for nftables, so better close this asap. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_arp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 805c8ddfe860..4bbc273b45e8 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -72,8 +72,7 @@ static const struct nf_chain_type filter_arp = { .family = NFPROTO_ARP, .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | - (1 << NF_ARP_OUT) | - (1 << NF_ARP_FORWARD), + (1 << NF_ARP_OUT), }; static int __init nf_tables_arp_init(void) -- cgit From 18bcf2907df935981266532e1e0d052aff2e6fae Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 17 Jul 2017 12:35:58 +0200 Subject: ipv4: ipv6: initialize treq->txhash in cookie_v[46]_check() KMSAN reported use of uninitialized memory in skb_set_hash_from_sk(), which originated from the TCP request socket created in cookie_v6_check(): ================================================================== BUG: KMSAN: use of uninitialized memory in tcp_transmit_skb+0xf77/0x3ec0 CPU: 1 PID: 2949 Comm: syz-execprog Not tainted 4.11.0-rc5+ #2931 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 TCP: request_sock_TCPv6: Possible SYN flooding on port 20028. Sending cookies. Check SNMP counters. Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x172/0x1c0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:927 __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:469 skb_set_hash_from_sk ./include/net/sock.h:2011 tcp_transmit_skb+0xf77/0x3ec0 net/ipv4/tcp_output.c:983 tcp_send_ack+0x75b/0x830 net/ipv4/tcp_output.c:3493 tcp_delack_timer_handler+0x9a6/0xb90 net/ipv4/tcp_timer.c:284 tcp_delack_timer+0x1b0/0x310 net/ipv4/tcp_timer.c:309 call_timer_fn+0x240/0x520 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 __run_timers+0xc13/0xf10 kernel/time/timer.c:1601 run_timer_softirq+0x36/0xa0 kernel/time/timer.c:1614 __do_softirq+0x485/0x942 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 irq_exit+0x1fa/0x230 kernel/softirq.c:405 exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:657 smp_apic_timer_interrupt+0x5a/0x80 arch/x86/kernel/apic/apic.c:966 apic_timer_interrupt+0x86/0x90 arch/x86/entry/entry_64.S:489 RIP: 0010:native_restore_fl ./arch/x86/include/asm/irqflags.h:36 RIP: 0010:arch_local_irq_restore ./arch/x86/include/asm/irqflags.h:77 RIP: 0010:__msan_poison_alloca+0xed/0x120 mm/kmsan/kmsan_instr.c:440 RSP: 0018:ffff880024917cd8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10 RAX: 0000000000000246 RBX: ffff8800224c0000 RCX: 0000000000000005 RDX: 0000000000000004 RSI: ffff880000000000 RDI: ffffea0000b6d770 RBP: ffff880024917d58 R08: 0000000000000dd8 R09: 0000000000000004 R10: 0000160000000000 R11: 0000000000000000 R12: ffffffff85abf810 R13: ffff880024917dd8 R14: 0000000000000010 R15: ffffffff81cabde4 poll_select_copy_remaining+0xac/0x6b0 fs/select.c:293 SYSC_select+0x4b4/0x4e0 fs/select.c:653 SyS_select+0x76/0xa0 fs/select.c:634 entry_SYSCALL_64_fastpath+0x13/0x94 arch/x86/entry/entry_64.S:204 RIP: 0033:0x4597e7 RSP: 002b:000000c420037ee0 EFLAGS: 00000246 ORIG_RAX: 0000000000000017 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004597e7 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 000000c420037ef0 R08: 000000c420037ee0 R09: 0000000000000059 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000042dc20 R13: 00000000000000f3 R14: 0000000000000030 R15: 0000000000000003 chained origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_save_stack mm/kmsan/kmsan.c:317 kmsan_internal_chain_origin+0x12a/0x1f0 mm/kmsan/kmsan.c:547 __msan_store_shadow_origin_4+0xac/0x110 mm/kmsan/kmsan_instr.c:259 tcp_create_openreq_child+0x709/0x1ae0 net/ipv4/tcp_minisocks.c:472 tcp_v6_syn_recv_sock+0x7eb/0x2a30 net/ipv6/tcp_ipv6.c:1103 tcp_get_cookie_sock+0x136/0x5f0 net/ipv4/syncookies.c:212 cookie_v6_check+0x17a9/0x1b50 net/ipv6/syncookies.c:245 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989 tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298 tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:198 kmsan_kmalloc+0x7f/0xe0 mm/kmsan/kmsan.c:337 kmem_cache_alloc+0x1c2/0x1e0 mm/slub.c:2766 reqsk_alloc ./include/net/request_sock.h:87 inet_reqsk_alloc+0xa4/0x5b0 net/ipv4/tcp_input.c:6200 cookie_v6_check+0x4f4/0x1b50 net/ipv6/syncookies.c:169 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989 tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298 tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 ================================================================== Similar error is reported for cookie_v4_check(). Fixes: 58d607d3e52f ("tcp: provide skb->hash to synack packets") Signed-off-by: Alexander Potapenko Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0905cf04c2a4..03ad8778c395 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -335,6 +335,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; treq->ts_off = 0; + treq->txhash = net_tx_rndhash(); req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; -- cgit From 0ddf3fb2c43d2e65aee5de158ed694ea11ef229d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Jul 2017 11:57:55 +0200 Subject: udp: preserve skb->dst if required for IP options processing Eric noticed that in udp_recvmsg() we still need to access skb->dst while processing the IP options. Since commit 0a463c78d25b ("udp: avoid a cache miss on dequeue") skb->dst is no more available at recvmsg() time and bad things will happen if we enter the relevant code path. This commit address the issue, avoid clearing skb->dst if any IP options are present into the relevant skb. Since the IP CB is contained in the first skb cacheline, we can test it to decide to leverage the consume_stateless_skb() optimization, without measurable additional cost in the faster path. v1 -> v2: updated commit message tags Fixes: 0a463c78d25b ("udp: avoid a cache miss on dequeue") Reported-by: Andrey Konovalov Reported-by: Eric Dumazet Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25294d43e147..b057653ceca9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1388,6 +1388,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + /* we cleared the head states previously only if the skb lacks any IP + * options, see __udp_queue_rcv_skb(). + */ + if (unlikely(IPCB(skb)->opt.optlen > 0)) + skb_release_head_state(skb); consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); @@ -1779,8 +1784,12 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* clear all pending head states while they are hot in the cache */ - skb_release_head_state(skb); + /* At recvmsg() time we need skb->dst to process IP options-related + * cmsg, elsewhere can we clear all pending head states while they are + * hot in the cache + */ + if (likely(IPCB(skb)->opt.optlen == 0)) + skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { -- cgit From 8799a221f5944a7d74516ecf46d58c28ec1d1f75 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 19 Jul 2017 15:41:33 -0700 Subject: ipv4: initialize fib_trie prior to register_netdev_notifier call. Net stack initialization currently initializes fib-trie after the first call to netdevice_notifier() call. In fact fib_trie initialization needs to happen before first rtnl_register(). It does not cause any problem since there are no devices UP at this moment, but trying to bring 'lo' UP at initialization would make this assumption wrong and exposes the issue. Fixes following crash Call Trace: ? alternate_node_alloc+0x76/0xa0 fib_table_insert+0x1b7/0x4b0 fib_magic.isra.17+0xea/0x120 fib_add_ifaddr+0x7b/0x190 fib_netdev_event+0xc0/0x130 register_netdevice_notifier+0x1c1/0x1d0 ip_fib_init+0x72/0x85 ip_rt_init+0x187/0x1e9 ip_init+0xe/0x1a inet_init+0x171/0x26c ? ipv4_offload_init+0x66/0x66 do_one_initcall+0x43/0x160 kernel_init_freeable+0x191/0x219 ? rest_init+0x80/0x80 kernel_init+0xe/0x150 ret_from_fork+0x22/0x30 Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08 RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: ffff9b1500017c28 CR2: 0000000000000014 Fixes: 7b1a74fdbb9e ("[NETNS]: Refactor fib initialization so it can handle multiple namespaces.") Fixes: 7f9b80529b8a ("[IPV4]: fib hash|trie initialization") Signed-off-by: Mahesh Bandewar Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4e678fa892dd..044d2a159a3c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1334,13 +1334,14 @@ static struct pernet_operations fib_net_ops = { void __init ip_fib_init(void) { - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + fib_trie_init(); register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_trie_init(); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); } -- cgit