From fb286bb2990a107009dbf25f6ffebeb7df77f9be Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 10 Nov 2005 13:01:24 -0800 Subject: [NET]: Detect hardware rx checksum faults correctly Here is the patch that introduces the generic skb_checksum_complete, which also checks for hardware RX checksum faults. If such a fault is detected, it calls netdev_rx_csum_fault, which currently prints out a stack trace with the device name. In future it could be used to turn off RX checksumming for the offending device. I've converted every spot under net/ that does RX checksum checks to use skb_checksum_complete or __skb_checksum_complete, with the exceptions of: * Those places where checksums are done bit by bit. These will call netdev_rx_csum_fault directly. * The following have not been completely checked/converted: ipmr ip_vs netfilter dccp This patch is based on patches and suggestions from Stephen Hemminger and David S. Miller. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller ---
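Note: the helpers this description introduces (netdev_rx_csum_fault(), __skb_checksum_complete() and the skb_checksum_complete() wrapper) live under net/core and include/, so this log, which is limited to net/ipv4, shows only their callers. A minimal sketch of the mechanism those callers rely on, reconstructed from the description above; treat the exact body as an assumption, not the committed code:

	/* Sketch: recompute the checksum in software, folding in whatever the
	 * caller left in skb->csum (the converted callers zero it first).
	 * If the software sum says the packet is good while the hardware had
	 * claimed a bad one (ip_summed still CHECKSUM_HW), then the RX
	 * checksum offload itself is at fault.
	 */
	unsigned int __skb_checksum_complete(struct sk_buff *skb)
	{
		unsigned int sum;

		sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
		if (likely(!sum)) {
			if (unlikely(skb->ip_summed == CHECKSUM_HW))
				netdev_rx_csum_fault(skb->dev);
			skb->ip_summed = CHECKSUM_UNNECESSARY;
		}
		return sum;
	}

This is why the callers below set skb->csum = 0 but deliberately leave ip_summed at CHECKSUM_HW when falling through from a failed hardware check: a software verification that then succeeds is exactly what identifies a hardware checksum fault.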
net/ipv4/icmp.c | 6 +++--- net/ipv4/igmp.c | 19 ++++++++++++++----- net/ipv4/ip_gre.c | 15 ++++++++------- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 11 ++++------- net/ipv4/tcp_ipv4.c | 24 +++++++++--------------- net/ipv4/udp.c | 7 ++----- 6 files changed, 40 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 175e093ec564..e3eceecd0496 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + skb->csum = 0; + if (__skb_checksum_complete(skb)) goto error; - default:; } if (!pskb_pull(skb, sizeof(struct icmphdr))) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c6247fc84060..c04607b49212 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb) return 0; } - if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || - (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { - in_dev_put(in_dev); - kfree_skb(skb); - return 0; + if (!pskb_may_pull(skb, sizeof(struct igmphdr))) + goto drop; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + if (__skb_checksum_complete(skb)) + goto drop; } ih = skb->h.igmph; @@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb) default: NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } + +drop: in_dev_put(in_dev); kfree_skb(skb); return 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 896ce3f8f53a..4e9c74b54b15 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb) goto drop_nolock; if (flags&GRE_CSUM) { - if (skb->ip_summed == CHECKSUM_HW) { + switch (skb->ip_summed) { + case CHECKSUM_HW: csum = (u16)csum_fold(skb->csum); - if (csum) - skb->ip_summed = CHECKSUM_NONE; - } - if (skb->ip_summed == CHECKSUM_NONE) { - skb->csum = skb_checksum(skb, 0, skb->len, 0); + if (!csum) + break; + /* fall through */ + case CHECKSUM_NONE: + skb->csum = 0; + csum = __skb_checksum_complete(skb); skb->ip_summed = CHECKSUM_HW; - csum = (u16)csum_fold(skb->csum); } offset += 4; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 5198f3a1e2cd..e4d6b268e8c4 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, - "ip_ct_icmp: bad HW ICMP checksum "); - return -NF_ACCEPT; + /* fall through */ case CHECKSUM_NONE: - if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + skb->csum = 0; + if (__skb_checksum_complete(skb)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } - default: - break; } checksum_skipped: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 634dabb558fd..ac1fcf5b4ebc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static int tcp_v4_checksum_init(struct sk_buff *skb) { if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, skb->csum)) + skb->nh.iph->daddr, skb->csum)) { + skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; - - LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); - skb->ip_summed = CHECKSUM_NONE; + } } + + skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len, IPPROTO_TCP, 0); + if (skb->len <= 76) { - if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, - skb->nh.iph->daddr, - skb_checksum(skb, 0, skb->len, 0))) - return -1; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { - skb->csum = ~tcp_v4_check(skb->h.th, skb->len, - skb->nh.iph->saddr, - skb->nh.iph->daddr, 0); + return __skb_checksum_complete(skb); } return 0; } @@ -1219,7 +1213,7 @@ int tcp_v4_rcv(struct sk_buff *skb) * provided case of th->doff==0 is elimineted. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && - tcp_v4_checksum_init(skb) < 0)) + tcp_v4_checksum_init(skb))) goto bad_packet; th = skb->h.th; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0d..2422a5f7195d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) static __inline__ int __udp_checksum_complete(struct sk_buff *skb) { - return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); + return __skb_checksum_complete(skb); } static __inline__ int udp_checksum_complete(struct sk_buff *skb) @@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, if (uh->check == 0) { skb->ip_summed = CHECKSUM_UNNECESSARY; } else if (skb->ip_summed == CHECKSUM_HW) { - skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) - return 0; - LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); -- cgit From f4805eded7d38c4e42bf473dc5eb2f34853beb06 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 16:53:30 -0800 Subject: [TCP]: fix congestion window update when using TSO deferral TCP performance with TSO over networks with delay is awful. On a 100Mbit link with 150ms delay, we get 4Mbits/sec with TSO and 50Mbits/sec without TSO. The problem is that with TSO we intentionally do not keep the maximum number of packets in flight to fill the window; we hold out until we can send an MSS-sized chunk. But we also don't update the congestion window unless we have filled it, as per RFC 2861. This patch replaces the check for the congestion window being full with something smarter that accounts for TSO. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller ---
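Note: tcp_is_cwnd_limited() itself is added to include/net/tcp.h, so it is not part of this net/ipv4-limited diff; only the converted callers are. A sketch of the intended logic, inferred from the commit description and from the EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor) hunk in tcp_output.c below; the exact body is an assumption:

	/* Sketch: RFC 2861 "is the window full?" check, relaxed for TSO.
	 * With TSO the sender deliberately holds packets back, so it also
	 * counts as cwnd-limited when the unused part of the window is small
	 * relative to the whole window; cwnd then keeps growing even though
	 * the window is not completely full.
	 */
	static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
	{
		const struct tcp_sock *tp = tcp_sk(sk);
		u32 left;

		if (in_flight >= tp->snd_cwnd)
			return 1;		/* window really is full */

		if (!(sk->sk_route_caps & NETIF_F_TSO))
			return 0;		/* no TSO: old behaviour */

		left = tp->snd_cwnd - in_flight;	/* headroom TSO defers into */
		if (sysctl_tcp_tso_win_divisor)
			return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd;
		return left <= tcp_max_burst(tp);	/* assumed small burst limit */
	}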
net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_highspeed.c | 4 ++-- net/ipv4/tcp_htcp.c | 2 +- net/ipv4/tcp_hybla.c | 6 +++--- net/ipv4/tcp_output.c | 1 + net/ipv4/tcp_scalable.c | 3 ++- 7 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ae35e0609047..5af99b3ef5d7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -217,7 +217,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, bictcp_low_utilization(sk, data_acked); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bbf2d6624e89..0705b496c6b3 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -186,7 +186,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 6acc04bde080..5e56ad368dd2 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,12 +111,12 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, int good) + u32 in_flight, u32 pkts_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index e47b37984e95..404a326ba345 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -207,7 +207,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, struct tcp_sock *tp = tcp_sk(sk); struct htcp *ca = inet_csk_ca(sk); - if (in_flight < tp->snd_cwnd) + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 77add63623df..40dbb3877510 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, ca->minrtt = tp->srtt; } + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + if (!ca->hybla_en) return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); - if (in_flight < tp->snd_cwnd) - return; - if (ca->rho == 0) hybla_recalc_param(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f4..998f6416ef8b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2058,3 +2058,4 @@ EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf5522..a2fd25617d24 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -20,7 +20,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32
in_flight, int flag) { struct tcp_sock *tp = tcp_sk(sk); - if (in_flight < tp->snd_cwnd) + + if (!tcp_is_cwnd_limited(sk, in_flight)) return; if (tp->snd_cwnd <= tp->snd_ssthresh) { -- cgit From 2d2abbab63f6726a147ae61ada39bf2c9ee0db9a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 16:56:12 -0800 Subject: [TCP]: simplify microsecond rtt sampling Simplify the code that computes the microsecond RTT estimate used by TCP Vegas. Move the callback out of the RTT sampler and into the end of the ack cleanup. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 62 +++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 32 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e98b57578dc..e43065654930 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -548,10 +548,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -610,9 +609,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); tp->rtt_seq = tp->snd_nxt; } - - if (icsk->icsk_ca_ops->rtt_sample) - icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1921,7 +1917,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -1940,13 +1936,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1960,21 +1956,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(sk, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); } static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, - const s32 seq_rtt, u32 *usrtt) + const s32 seq_rtt) { const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored.
(rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(sk, usrtt, flag); + tcp_ack_saw_tstamp(sk, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, flag); } static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, @@ -2054,20 +2050,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } +static inline u32 tcp_usrtt(const struct sk_buff *skb) +{ + struct timeval tv, now; + + do_gettimeofday(&now); + skb_get_timestamp(skb, &tv); + return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); +} /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; - struct timeval usnow; u32 pkts_acked = 0; - - if (seq_usrtt) - do_gettimeofday(&usnow); + void (*rtt_sample)(struct sock *sk, u32 usrtt) + = icsk->icsk_ca_ops->rtt_sample; while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2107,16 +2110,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt tp->retrans_out -= tcp_skb_pcount(skb); acked |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - if (seq_usrtt) { - struct timeval tv; - - skb_get_timestamp(skb, &tv); - *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 - + (usnow.tv_usec - tv.tv_usec); + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); } - if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2126,8 +2124,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt !before(scb->end_seq, tp->snd_up)) tp->urg_mode = 0; } - } else if (seq_rtt < 0) + } else if (seq_rtt < 0) { seq_rtt = now - scb->when; + if (rtt_sample) + (*rtt_sample)(sk, tcp_usrtt(skb)); + } tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); @@ -2135,8 +2136,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { - const struct inet_connection_sock *icsk = inet_csk(sk); - tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk, tp); if (icsk->icsk_ca_ops->pkts_acked) @@ -2299,7 +2299,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; - s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2352,8 +2351,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - icsk->icsk_ca_ops->rtt_sample ? 
&seq_usrtt : NULL); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); @@ -4242,7 +4240,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, NULL, 0); + tcp_ack_saw_tstamp(sk, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; -- cgit From 7faffa1c7fb9b8e8917e3225d4e2638270c0a48b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:07:24 -0800 Subject: [TCP]: add tcp_slow_start helper Move all the code that does linear TCP slow start into one inline function to ease a later patch adding ABC support. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller ---
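Note: the tcp_slow_start() helper itself goes into include/net/tcp.h, outside this net/ipv4-limited diff. Judging from the open-coded increments it replaces in every hunk below, a sketch (an assumption, not the committed body):

	/* Sketch: linear (Reno) slow start, one segment per ACK, clamped.
	 * Centralising this lets the following Appropriate Byte Count patch
	 * change slow-start behaviour in a single place instead of in every
	 * congestion control module.
	 */
	static inline void tcp_slow_start(struct tcp_sock *tp)
	{
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	}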
net/ipv4/tcp_bic.c | 10 ++++------ net/ipv4/tcp_cong.c | 11 +++++------ net/ipv4/tcp_highspeed.c | 7 +++---- net/ipv4/tcp_htcp.c | 11 +++++------ net/ipv4/tcp_scalable.c | 11 +++++------ net/ipv4/tcp_vegas.c | 42 +++++++++++------------------------------- 6 files changed, 33 insertions(+), 59 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 5af99b3ef5d7..1d0cd86621b1 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -220,14 +220,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { bictcp_update(ca, tp->snd_cwnd); - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= ca->cnt) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 0705b496c6b3..6d3e883b48f6 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,12 +189,11 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. + /* In "safe" area, increase. */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 5e56ad368dd2..82b3c189bd7d 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -119,10 +119,9 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 404a326ba345..3284cfb993e6 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -210,11 +210,10 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ @@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, htcp_alpha_update(ca); } - /* In dangerous area, increase slowly. + /* In dangerous area, increase slowly. * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a2fd25617d24..26d7486ee501 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -24,17 +24,16 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { - tp->snd_cwnd++; - } else { + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { tp->snd_cwnd_cnt++; if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ - tp->snd_cwnd++; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } } - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - tp->snd_cwnd_stamp = tcp_time_stamp; } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f9..4376814d29fb 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt); } else { u32 rtt, target_cwnd, diff; @@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, */ diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tp->snd_cwnd <= tp->snd_ssthresh) { /* Slow start. */ if (diff > gamma) { /* Going too fast. Time to slow down @@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, V_PARAM_SHIFT)+1); } + tcp_slow_start(tp); } else { /* Congestion avoidance. */ u32 next_snd_cwnd; @@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (next_snd_cwnd < tp->snd_cwnd) tp->snd_cwnd--; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } } - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver.
- */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; } /* Extract info for Tcp socket info provided via netlink. */ -- cgit From 9772efb970780aeed488c19d8b4afd46c3b484af Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:09:53 -0800 Subject: [TCP]: Appropriate Byte Count support This is an updated version of the RFC3465 ABC patch originally for Linux 2.6.11-rc4 by Yee-Ting Li. ABC is a way of counting bytes ack'd rather than packets when updating congestion control. The original ABC described in the RFC applied to a Reno style algorithm. For advanced congestion control there is little change after leaving slow start. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 31 ++++++++++++++++++++----------- net/ipv4/tcp_input.c | 7 +++++++ net/ipv4/tcp_minisocks.c | 1 + 5 files changed, 37 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 652685623519..01444a02b48b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -645,6 +645,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_tcp_congestion_control, .strategy = &sysctl_tcp_congestion_control, }, + { + .ctl_name = NET_TCP_ABC, + .procname = "tcp_abc", + .data = &sysctl_tcp_abc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 72b7c22e1ea5..cfaf76133759 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6d3e883b48f6..c7cc62c8dc12 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -192,17 +192,26 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, /* In "safe" area, increase. */ if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp); - else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else - tp->snd_cwnd_cnt++; - } + + /* In dangerous area, increase slowly. */ + else if (sysctl_tcp_abc) { + /* RFC3465: Apppriate Byte Count + * increase once for each full cwnd acked + */ + if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { + tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { + /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e43065654930..4cb5e6f408dc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -89,6 +89,7 @@ int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf = 1; +int sysctl_tcp_abc = 1; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update.
*/ @@ -1247,6 +1248,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; + tp->bytes_acked = 0; tcp_clear_retrans(tp); /* Push undo marker, if it was plain RTO and nothing @@ -1904,6 +1906,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, TCP_ECN_queue_cwr(tp); } + tp->bytes_acked = 0; tp->snd_cwnd_cnt = 0; tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -2310,6 +2313,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) + tp->bytes_acked += ack - prior_snd_una; + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -4370,6 +4376,7 @@ discard: EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4a..9203a21e299f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, */ newtp->snd_cwnd = 2; newtp->snd_cwnd_cnt = 0; + newtp->bytes_acked = 0; newtp->frto_counter = 0; newtp->frto_highmark = 0; -- cgit From 326f36e9e7de362e09745ce6f84b65e7ccac33ba Mon Sep 17 00:00:00 2001 From: John Heffner Date: Thu, 10 Nov 2005 17:11:48 -0800 Subject: [TCP]: receive buffer growth limiting with mixed MTU This is a patch for discussion addressing some receive buffer growing issues. This is partially related to the thread "Possible BUG in IPv4 TCP window handling..." last week. Specifically it addresses the problem of an interaction between rcvbuf moderation (receiver autotuning) and rcv_ssthresh. The problem occurs when sending small packets to a receiver with a larger MTU. (A very common case I have is a host with a 1500 byte MTU sending to a host with a 9k MTU.) In such a case, the rcv_ssthresh code is targeting a window size corresponding to filling up the current rcvbuf, not taking into account that the new rcvbuf moderation may increase the rcvbuf size. One hunk makes rcv_ssthresh use tcp_rmem[2] as the size target rather than rcvbuf. The other changes the behavior when it overflows its memory bounds with in-order data so that it tries to grow rcvbuf (the same as with out-of-order data). These changes should help my problem of mixed MTUs, and should also help the case from last week's thread I think. (In both cases though you still need tcp_rmem[2] to be set much larger than the TCP window.) One question is if this is too aggressive at trying to increase rcvbuf if it's under memory stress. Originally-from: John Heffner Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4cb5e6f408dc..827cd4b9e867 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -234,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, { /* Optimize this!
*/ int truesize = tcp_win_from_space(skb->truesize)/2; - int window = tcp_full_space(sk)/2; + int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -327,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk) static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - unsigned int app_win = tp->rcv_nxt - tp->copied_seq; - int ofo_win = 0; icsk->icsk_ack.quick = 0; - skb_queue_walk(&tp->out_of_order_queue, skb) { - ofo_win += skb->len; - } - - /* If overcommit is due to out of order segments, - * do not clamp window. Try to expand rcvbuf instead. - */ - if (ofo_win) { - if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !tcp_memory_pressure && - atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) - sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), - sysctl_tcp_rmem[2]); + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); } - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - app_win += ofo_win; - if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) - app_win >>= 1; - if (app_win > icsk->icsk_ack.rcv_mss) - app_win -= icsk->icsk_ack.rcv_mss; - app_win = max(app_win, 2U*tp->advmss); - + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); - } } /* Receiver "autotuning" code. -- cgit From caa20d9abe810be2ede9612b6c9db6ce7d6edf80 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:13:47 -0800 Subject: [TCP]: spelling fixes Minor spelling fixes for TCP code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 40 ++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/tcp_minisocks.c | 6 +++--- net/ipv4/tcp_output.c | 6 +++--- net/ipv4/tcp_timer.c | 4 ++-- 6 files changed, 31 insertions(+), 31 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cfaf76133759..9ac7a4f46bd8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags) } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { - /* The last check adjusts for discrepance of Linux wrt. RFC + /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any()); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 827cd4b9e867..34cfa58eab76 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -42,7 +42,7 @@ * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. - * Andrey Savochkin: Fix RTT measurements in the presnce of + * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming @@ -224,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk) * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening - * window and then starts to feed us spagetti. 
But it should work + * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ @@ -278,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); /* Try to select rcvbuf so that 4 mss-sized segments - * will fit to window and correspoding skbs will fit to our rcvbuf. + * will fit to window and corresponding skbs will fit to our rcvbuf. * (was 3; 4 is minimum to allow fast retransmit to work.) */ while (tcp_win_from_space(rcvmem) < tp->advmss) @@ -287,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk) sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); } -/* 4. Try to fixup all. It is made iimediately after connection enters +/* 4. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) @@ -367,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoothe things out - * else with timestamps disabled convergance takes too + * non-timestamp case, we do not smoother things out + * else with timestamps disabled convergence takes too * long. */ if (!win_dep) { @@ -377,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) } else if (m < new_sample) new_sample = m << 3; } else { - /* No previous mesaure. */ + /* No previous measure. */ new_sample = m << 3; } @@ -506,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { - /* Too long gap. Apparently sender falled to + /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk); @@ -546,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be incresed fastly, decrease too fastly + * too slowly, when it should be increased fastly, decrease too fastly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) @@ -607,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk) * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic - * ACKs in some curcumstances. + * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced - * with correct one. It is exaclty, which we pretend to do. + * with correct one. It is exactly, which we pretend to do. */ } @@ -772,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk) * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet - * is sent until it is ACKed. In normal curcumstances sending small + * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. 
* The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever @@ -1899,7 +1899,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } /* Read draft-ietf-tcplw-high-performance before mucking - * with this code. (Superceeds RFC1323) + * with this code. (Supersedes RFC1323) */ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) { @@ -1912,7 +1912,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag) * 1998/04/10 Andrey V. Savochkin * * Changed: reset backoff as soon as we see the first valid sample. - * If we do not, we get strongly overstimated rto. With timestamps + * If we do not, we get strongly overestimated rto. With timestamps * samples are accepted even from very old segments: f.e., when rtt=1 * increases to 8, we retransmit 5 times and after 8 seconds delayed * answer arrives rto becomes 120 seconds! If at least one of segments @@ -2268,7 +2268,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) } /* F-RTO affects on two new ACKs following RTO. - * At latest on third ACK the TCP behavor is back to normal. + * At latest on third ACK the TCP behavior is back to normal. */ tp->frto_counter = (tp->frto_counter + 1) % 3; } @@ -2344,7 +2344,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(sk, flag)) { - /* Advanve CWND, if state allows this. */ + /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); @@ -3133,7 +3133,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, { struct sk_buff *skb; - /* First, check that queue is collapsable and find + /* First, check that queue is collapsible and find * the point where collapsing can be useful. */ for (skb = head; skb != tail; ) { /* No new bits? It is possible on ofo queue. */ @@ -3441,7 +3441,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) /* * This routine is only called when we have urgent data - * signalled. Its the 'slow' part of tcp_urg. It could be + * signaled. Its the 'slow' part of tcp_urg. It could be * moved inline now as tcp_urg is only called from one * place. We handle URGent data wrong. We have to - as * BSD still doesn't use the correction from RFC961. @@ -3486,7 +3486,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) * urgent. To do this requires some care. We cannot just ignore * tp->copied_seq since we would read the last urgent byte again * as data, nor can we alter copied_seq until this data arrives - * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * or we break the semantics of SIOCATMARK (and thus sockatmark()) * * NOTE. Double Dutch. Rendering to plain English: author of comment * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); @@ -3631,7 +3631,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rx_opt.saw_tstamp = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd - * if header_predition is to be made + * if header_prediction is to be made * 'S' will always be tp->tcp_header_len >> 2 * '?' 
will be 0 for the fast path, otherwise pred_flags is 0 to * turn it off (when there are holes in the receive diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac1fcf5b4ebc..4d5021e1929b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -39,7 +39,7 @@ * request_sock handling and moved * most of it into the af independent code. * Added tail drop and some other bugfixes. - * Added new listen sematics. + * Added new listen semantics. * Mike McLagan : Routing by source * Juan Jose Ciarlante: ip_dynaddr bits * Andi Kleen: various fixes. @@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb) /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, - * provided case of th->doff==0 is elimineted. + * provided case of th->doff==0 is eliminated. * So, we defer the checks. */ if ((skb->ip_summed != CHECKSUM_UNNECESSARY && tcp_v4_checksum_init(skb))) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9203a21e299f..1b66a2ac4321 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -158,7 +158,7 @@ kill_with_rst: /* I am shamed, but failed to make it more elegant. * Yes, it is direct reference to IP, which is impossible * to generalize to IPv6. Taking into account that IPv6 - * do not undertsnad recycling in any case, it not + * do not understand recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && @@ -194,7 +194,7 @@ kill_with_rst: /* In window segment, it may be only reset or bare ack. */ if (th->rst) { - /* This is TIME_WAIT assasination, in two flavors. + /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ @@ -551,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet - * sent (the segment carries an unaccaptable ACK) ... + * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 998f6416ef8b..602e7057e438 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -599,7 +599,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) for TCP options, but includes only bare TCP header. tp->rx_opt.mss_clamp is mss negotiated at connection setup. - It is minumum of user_mss and mss received with SYN. + It is minimum of user_mss and mss received with SYN. It also does not include TCP options. tp->pmtu_cookie is last pmtu, seen by this function. @@ -1171,7 +1171,7 @@ u32 __tcp_select_window(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - /* MSS for the peer's data. Previous verions used mss_clamp + /* MSS for the peer's data. Previous versions used mss_clamp * here. I don't know if the value based on our guesses * of peer's MSS is better for the performance. It's more correct * but may be worse for the performance because of rcv_mss @@ -1361,7 +1361,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) int err; /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: frgagmentation, tunneling, mangling etc. + * copying overhead: fragmentation, tunneling, mangling etc. 
*/ if (atomic_read(&sk->sk_wmem_alloc) > min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c5..e1880959614a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) * to prevent DoS attacks. It is called when a retransmission timeout * or zero probe timeout occurs on orphaned socket. * - * Criterium is still not confirmed experimentally and may change. + * Criteria is still not confirmed experimentally and may change. * We kill the socket, if: * 1. If number of orphaned sockets exceeds an administratively configured * limit. @@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) hole detection. :-( It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any + to make it. It is disgusting. It does not work in any case. Let me to cite the same draft, which requires for us to implement this: -- cgit From 6a438bbe68c7013a42d9c5aee5a40d7dafdbe6ec Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 10 Nov 2005 17:14:59 -0800 Subject: [TCP]: speed up SACK processing Use "hints" to speed up the SACK processing. Various forms of this have been used by TCP developers (Web100, STCP, BIC) to avoid the 2x linear search of outstanding segments. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller ---
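Note: the hint state itself (new fields in struct tcp_sock) and the helpers clear_all_retrans_hints() and sk_stream_for_retrans_queue_from() are declared under include/, so they do not appear in this net/ipv4-limited diff. A sketch of those supporting pieces, inferred from how the hunks below use them; the field list and bodies are assumptions:

	/* Sketch: cached positions in the retransmit queue, one per walker
	 * (SACK tagging, loss marking, scoreboard timeout, retransmit and
	 * forward-retransmit passes), so each pass can resume where the
	 * previous one stopped instead of re-walking the queue from the
	 * head.  Paired *_cnt_hint fields remember the count at that skb.
	 */
	static inline void clear_all_retrans_hints(struct tcp_sock *tp)
	{
		tp->lost_skb_hint = NULL;
		tp->scoreboard_skb_hint = NULL;
		tp->retransmit_skb_hint = NULL;
		tp->forward_skb_hint = NULL;
		tp->fastpath_skb_hint = NULL;
	}

	/* Continue a retransmit-queue walk from a cached skb rather than
	 * from the start of the write queue.
	 */
	#define sk_stream_for_retrans_queue_from(skb, sk)			\
		for (; (skb) != (sk)->sk_send_head &&				\
		       (skb) != (struct sk_buff *)&(sk)->sk_write_queue;	\
		     (skb) = (skb)->next)

Since the hints describe positions in the queue, anything that reorders, splits or frees queued skbs has to invalidate them, which is why the hunks below call clear_all_retrans_hints() from tcp_fragment(), the collapse path and the loss-state transitions.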
net/ipv4/tcp_input.c | 144 ++++++++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_output.c | 54 +++++++++++++++---- 2 files changed, 172 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 34cfa58eab76..40a26b7157b4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -897,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; + int dup_sack = 0; int i; if (!tp->sacked_out) tp->fackets_out = 0; prior_fackets = tp->fackets_out; - for (i=0; i<num_sacks; i++, sp++) { - __u32 start_seq = ntohl(sp->start_seq); - __u32 end_seq = ntohl(sp->end_seq); - int fack_count = 0; - int dup_sack = 0; + /* SACK fastpath: + * if the only SACK change is the increase of the end_seq of + * the first block then only apply that SACK block + * and use retrans queue hinting otherwise slowpath */ + flag = 1; + for (i = 0; i< num_sacks; i++) { + __u32 start_seq = ntohl(sp[i].start_seq); + __u32 end_seq = ntohl(sp[i].end_seq); + + if (i == 0){ + if (tp->recv_sack_cache[i].start_seq != start_seq) + flag = 0; + } else { + if ((tp->recv_sack_cache[i].start_seq != start_seq) || + (tp->recv_sack_cache[i].end_seq != end_seq)) + flag = 0; + } + tp->recv_sack_cache[i].start_seq = start_seq; + tp->recv_sack_cache[i].end_seq = end_seq; /* Check for D-SACK. */ if (i == 0) { @@ -940,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (before(ack, prior_snd_una - tp->max_window)) return 0; } + } + + if (flag) + num_sacks = 1; + else { + int j; + tp->fastpath_skb_hint = NULL; + + /* order SACK blocks to allow in order walk of the retrans queue */ + for (i = num_sacks-1; i > 0; i--) { + for (j = 0; j < i; j++){ + if (after(ntohl(sp[j].start_seq), + ntohl(sp[j+1].start_seq))){ + sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq); + sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq); + sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq); + sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq); + } + + } + } + } + + /* clear flag as used for different purpose in following code */ + flag = 0; + + for (i=0; i<num_sacks; i++, sp++) { + __u32 start_seq = ntohl(sp->start_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count; + + /* Use SACK fastpath hint if valid */ + if (tp->fastpath_skb_hint) { + skb = tp->fastpath_skb_hint; + fack_count = tp->fastpath_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + fack_count = 0; + } /* Event "B" in the comment above. */ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue(skb, sk) { + sk_stream_for_retrans_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + tp->fastpath_skb_hint = skb; + tp->fastpath_cnt_hint = fack_count; + /* The retransmission queue is always in order, so * we can short-circuit the walk early. */ @@ -1023,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= tcp_skb_pcount(skb); tp->retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } else { /* New sack for not retransmitted frame, @@ -1035,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (sacked & TCPCB_LOST) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; tp->lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; } } @@ -1058,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + tp->retransmit_skb_hint = NULL; } } } @@ -1085,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); + /* clear lost hint */ + tp->retransmit_skb_hint = NULL; + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1192,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1258,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -1482,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, int packets, u32 high_seq) { struct sk_buff *skb; - int cnt = packets; + int cnt; - BUG_TRAP(cnt <= tp->packets_out); + BUG_TRAP(packets <= tp->packets_out); + if (tp->lost_skb_hint) { skb =
tp->lost_skb_hint; + cnt = tp->lost_cnt_hint; + } else { + skb = sk->sk_write_queue.next; + cnt = 0; + } - sk_stream_for_retrans_queue(skb, sk) { - cnt -= tcp_skb_pcount(skb); - if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + sk_stream_for_retrans_queue_from(skb, sk) { + /* TODO: do this better */ + /* this is not the most efficient way to do this... */ + tp->lost_skb_hint = skb; + tp->lost_cnt_hint = cnt; + cnt += tcp_skb_pcount(skb); + if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retransmit_queue hints + * if this is beyond hint */ + if(tp->retransmit_skb_hint != NULL && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { + + tp->retransmit_skb_hint = NULL; + } } } tcp_sync_left_out(tp); @@ -1519,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (tcp_head_timedout(sk, tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(sk, skb) && - !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint + : sk->sk_write_queue.next; + + sk_stream_for_retrans_queue_from(skb, sk) { + if (!tcp_skb_timedout(sk, skb)) + break; + + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); + + /* clear xmit_retrans hint */ + if (tp->retransmit_skb_hint && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + + tp->retransmit_skb_hint = NULL; } } + + tp->scoreboard_skb_hint = skb; + tcp_sync_left_out(tp); } } @@ -1605,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) } tcp_moderate_cwnd(tp); tp->snd_cwnd_stamp = tcp_time_stamp; + + /* There is something screwy going on with the retrans hints after + an undo */ + clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1688,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) sk_stream_for_retrans_queue(skb, sk) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } + + clear_all_retrans_hints(tp); + DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; @@ -2117,6 +2230,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_packets_out_dec(tp, skb); __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); + clear_all_retrans_hints(tp); } if (acked&FLAG_ACKED) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 602e7057e438..029c70dfb585 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss u16 flags; BUG_ON(len > skb->len); + + clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize < 0) nsize = 0; @@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* Ok. We will be able to collapse the packet. */ + /* changing transmit queue under us so clear hints */ + clear_all_retrans_hints(tp); + + /* Ok. We will be able to collapse the packet. 
*/ __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); @@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) } } + clear_all_retrans_hints(tp); + if (!lost) return; @@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int packet_cnt = tp->lost_out; + int packet_cnt; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + packet_cnt = tp->retransmit_cnt_hint; + }else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } /* First pass: retransmit lost packets. */ - if (packet_cnt) { - sk_stream_for_retrans_queue(skb, sk) { + if (tp->lost_out) { + sk_stream_for_retrans_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + /* we could do better than to assign each time */ + tp->retransmit_skb_hint = skb; + tp->retransmit_cnt_hint = packet_cnt; + /* Assume this retransmit will generate * only one packet for congestion window * calculation purposes. This works because @@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) return; - if (sacked&TCPCB_LOST) { + if (sacked & TCPCB_LOST) { if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->retransmit_skb_hint = NULL; return; + } if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else @@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) TCP_RTO_MAX); } - packet_cnt -= tcp_skb_pcount(skb); - if (packet_cnt <= 0) + packet_cnt += tcp_skb_pcount(skb); + if (packet_cnt >= tp->lost_out) break; } } @@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_may_send_now(sk, tp)) return; - packet_cnt = 0; + if (tp->forward_skb_hint) { + skb = tp->forward_skb_hint; + packet_cnt = tp->forward_cnt_hint; + } else{ + skb = sk->sk_write_queue.next; + packet_cnt = 0; + } + + sk_stream_for_retrans_queue_from(skb, sk) { + tp->forward_cnt_hint = packet_cnt; + tp->forward_skb_hint = skb; - sk_stream_for_retrans_queue(skb, sk) { /* Similar to the retransmit loop above we * can pretend that the retransmitted SKB * we send out here will be composed of one @@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb)) { + tp->forward_skb_hint = NULL; break; + } if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, -- cgit From c050970a257a4060e927e497a12323e961fcbadc Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Fri, 11 Nov 2005 04:43:47 -0500 Subject: [PATCH] TCP: fix vegas build Recent TCP changes broke the build. Signed-off-by: Jeff Garzik Signed-off-by: Linus Torvalds --- net/ipv4/tcp_vegas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 4376814d29fb..b7d296a8ac6d 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -236,7 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, /* We don't have enough RTT samples to do the Vegas * calculation, so we'll behave like Reno. 
*/ - tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt); + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); } else { u32 rtt, target_cwnd, diff; -- cgit From a2d7222f0f5861ce13b9308c30bd18f28ebeb583 Mon Sep 17 00:00:00 2001 From: Vlad Drukker Date: Sat, 12 Nov 2005 12:13:14 -0800 Subject: [NETFILTER] {ip,nf}_conntrack TCP: Accept SYN+PUSH like SYN Some devices (e.g. Qlogic iSCSI HBA hardware like QLA4010 up to firmware 3.0.0.4) initiate TCP with SYN and PUSH flags set. The Linux TCP/IP stack deals fine with that, but the connection tracking code doesn't. This patch alters TCP connection tracking to accept SYN+PUSH as a valid flag combination. Signed-off-by: Vlad Drukker Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 468c6003b4c7..5b3f5220f289 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -814,6 +814,7 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, + [TH_SYN|TH_PUSH] = 1, [TH_SYN|TH_ACK|TH_PUSH] = 1, [TH_RST] = 1, [TH_RST|TH_ACK] = 1, -- cgit From dbd36ea496726460299842fdbeaaa7fff2f0c5c7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2005 15:21:01 -0800 Subject: [NETFILTER] ctnetlink: use size_t to make gcc-4.x happy Make gcc-4.x happy. Use size_t instead of int. Thanks to Patrick McHardy for the hint. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d2a4fec22862..853d0ac5534f 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -467,7 +467,7 @@ out: } #endif -static const int cta_min_ip[CTA_IP_MAX] = { +static const size_t cta_min_ip[CTA_IP_MAX] = { [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), [CTA_IP_V4_DST-1] = sizeof(u_int32_t), }; @@ -497,7 +497,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) return 0; } -static const int cta_min_proto[CTA_PROTO_MAX] = { +static const size_t cta_min_proto[CTA_PROTO_MAX] = { [CTA_PROTO_NUM-1] = sizeof(u_int16_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), @@ -576,7 +576,7 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, } #ifdef CONFIG_IP_NF_NAT_NEEDED -static const int cta_min_protonat[CTA_PROTONAT_MAX] = { +static const size_t cta_min_protonat[CTA_PROTONAT_MAX] = { [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), }; -- cgit From 56558208521729fa6b2a0f12df22e1569dee297a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Nov 2005 15:22:11 -0800 Subject: [NETFILTER] ctnetlink: More thorough size checking of attributes Add missing size checks. Thanks Patrick McHardy for the hint.
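The idiom this patch keeps adding deserves one spelled-out illustration: each message handler declares a table of minimum payload sizes, indexed by attribute type, and rejects the whole message before any attribute data is dereferenced. A minimal sketch of that check follows; the DEMO_ATTR_* names are hypothetical, and the open-coded loop is a simplified stand-in for the real nfattr_bad_size() helper used in the diff below:

	/* hypothetical attribute set, for illustration only */
	static const size_t demo_min[DEMO_ATTR_MAX] = {
		[DEMO_ATTR_ID - 1]      = sizeof(u_int32_t),
		[DEMO_ATTR_TIMEOUT - 1] = sizeof(u_int32_t),
	};

	static int demo_bad_size(struct nfattr *tb[], int max, const size_t *min)
	{
		int i;

		/* a present attribute shorter than its declared minimum
		 * makes the whole message invalid (caller returns -EINVAL) */
		for (i = 0; i < max; i++)
			if (min[i] && tb[i] && NFA_PAYLOAD(tb[i]) < min[i])
				return 1;
		return 0;
	}

Doing the length check once, up front, lets the per-attribute code cast NFA_DATA() without re-validating anything. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S.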
Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 39 +++++++++++++++++++++++++++++ net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 7 ++++++ 2 files changed, 46 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 853d0ac5534f..f5e5e3158670 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -614,6 +614,11 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, return 0; } +static const size_t cta_min_nat[CTA_NAT_MAX] = { + [CTA_NAT_MINIP-1] = sizeof(u_int32_t), + [CTA_NAT_MAXIP-1] = sizeof(u_int32_t), +}; + static inline int ctnetlink_parse_nat(struct nfattr *cda[], const struct ip_conntrack *ct, struct ip_nat_range *range) @@ -627,6 +632,9 @@ ctnetlink_parse_nat(struct nfattr *cda[], nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) + return -EINVAL; + if (tb[CTA_NAT_MINIP-1]) range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); @@ -667,6 +675,14 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) return 0; } +static const size_t cta_min[CTA_MAX] = { + [CTA_STATUS-1] = sizeof(u_int32_t), + [CTA_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_MARK-1] = sizeof(u_int32_t), + [CTA_USE-1] = sizeof(u_int32_t), + [CTA_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -678,6 +694,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -760,6 +779,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return 0; } + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); else if (cda[CTA_TUPLE_REPLY-1]) @@ -1047,6 +1069,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_MAX, cta_min)) + return -EINVAL; + if (cda[CTA_TUPLE_ORIG-1]) { err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); if (err < 0) @@ -1252,6 +1277,11 @@ out: return skb->len; } +static const size_t cta_min_exp[CTA_EXPECT_MAX] = { + [CTA_EXPECT_TIMEOUT-1] = sizeof(u_int32_t), + [CTA_EXPECT_ID-1] = sizeof(u_int32_t) +}; + static int ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) @@ -1263,6 +1293,9 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (nlh->nlmsg_flags & NLM_F_DUMP) { struct nfgenmsg *msg = NLMSG_DATA(nlh); u32 rlen; @@ -1333,6 +1366,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (cda[CTA_EXPECT_TUPLE-1]) { /* delete a single expect by tuple */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); @@ -1462,6 +1498,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); + if (nfattr_bad_size(cda, CTA_EXPECT_MAX, cta_min_exp)) + return -EINVAL; + if (!cda[CTA_EXPECT_TUPLE-1] || 
!cda[CTA_EXPECT_MASK-1] || !cda[CTA_EXPECT_MASTER-1]) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 5b3f5220f289..ee3b7d6c4d2e 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -357,6 +357,10 @@ nfattr_failure: return -1; } +static const size_t cta_min_tcp[CTA_PROTOINFO_TCP_MAX] = { + [CTA_PROTOINFO_TCP_STATE-1] = sizeof(u_int8_t), +}; + static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) { struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1]; @@ -369,6 +373,9 @@ static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct) nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr); + if (nfattr_bad_size(tb, CTA_PROTOINFO_TCP_MAX, cta_min_tcp)) + return -EINVAL; + if (!tb[CTA_PROTOINFO_TCP_STATE-1]) return -EINVAL; -- cgit From 37d2e7a20d745035b600f1a6be56cbb9c7259419 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 14 Nov 2005 15:24:59 -0800 Subject: [NETFILTER] nfnetlink: unconditionally require CAP_NET_ADMIN This patch unconditionally requires CAP_NET_ADMIN for all nfnetlink messages. It also removes the per-message cap_required field, since all existing subsystems use CAP_NET_ADMIN for all their messages anyway. Patrick McHardy owes me a beer if we ever need to re-introduce this. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index f5e5e3158670..de9f4464438d 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1543,29 +1543,22 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_MAX, }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .cap_required = CAP_NET_ADMIN }, + .attr_count = CTA_EXPECT_MAX, }, }; static struct nfnetlink_subsystem ctnl_subsys = { -- cgit From 31f3426904e066f17e3f88c468a2f7c869ad4aac Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 15 Nov 2005 15:17:10 -0800 Subject: [TCP]: More spelling fixes. From Joe Perches Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 40a26b7157b4..bf2e23086bce 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -367,7 +367,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the - * non-timestamp case, we do not smoother things out + * non-timestamp case, we do not smooth things out * else with timestamps disabled convergence takes too * long. */ @@ -546,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase - * too slowly, when it should be increased fastly, decrease too fastly + * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) -- cgit From e7c8a41e817f381ac5c2a59ecc81b483bd68a7df Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Wed, 16 Nov 2005 12:55:37 -0800 Subject: [IPV4,IPV6]: replace handmade list with hlist in IPv{4,6} reassembly Both of ipq and frag_queue have *next and **prev, and they can be replaced with hlist. Thanks Arnaldo Carvalho de Melo for the suggestion. Signed-off-by: Yasuyuki Kozakai Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e7d26d9943c2..8ce0ce2ee48e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -71,7 +71,7 @@ struct ipfrag_skb_cb /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { - struct ipq *next; /* linked list pointers */ + struct hlist_node list; struct list_head lru_list; /* lru list member */ u32 user; u32 saddr; @@ -89,7 +89,6 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - struct ipq **pprev; int iif; struct timeval stamp; }; @@ -99,7 +98,7 @@ struct ipq { #define IPQ_HASHSZ 64 /* Per-bucket lock is easy to add now. */ -static struct ipq *ipq_hash[IPQ_HASHSZ]; +static struct hlist_head ipq_hash[IPQ_HASHSZ]; static DEFINE_RWLOCK(ipfrag_lock); static u32 ipfrag_hash_rnd; static LIST_HEAD(ipq_lru_list); @@ -107,9 +106,7 @@ int ip_frag_nqueues = 0; static __inline__ void __ipq_unlink(struct ipq *qp) { - if(qp->next) - qp->next->pprev = qp->pprev; - *qp->pprev = qp->next; + hlist_del(&qp->list); list_del(&qp->lru_list); ip_frag_nqueues--; } @@ -139,27 +136,18 @@ static void ipfrag_secret_rebuild(unsigned long dummy) get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); for (i = 0; i < IPQ_HASHSZ; i++) { struct ipq *q; + struct hlist_node *p, *n; - q = ipq_hash[i]; - while (q) { - struct ipq *next = q->next; + hlist_for_each_entry_safe(q, p, n, &ipq_hash[i], list) { unsigned int hval = ipqhashfn(q->id, q->saddr, q->daddr, q->protocol); if (hval != i) { - /* Unlink. */ - if (q->next) - q->next->pprev = q->pprev; - *q->pprev = q->next; + hlist_del(&q->list); /* Relink to new hash chain. 
*/ - if ((q->next = ipq_hash[hval]) != NULL) - q->next->pprev = &q->next; - ipq_hash[hval] = q; - q->pprev = &ipq_hash[hval]; + hlist_add_head(&q->list, &ipq_hash[hval]); } - - q = next; } } write_unlock(&ipfrag_lock); @@ -310,14 +298,16 @@ out: static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) { struct ipq *qp; - +#ifdef CONFIG_SMP + struct hlist_node *n; +#endif write_lock(&ipfrag_lock); #ifdef CONFIG_SMP /* With SMP race we have to recheck hash table, because * such entry could be created on other cpu, while we * promoted read lock to write lock. */ - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == qp_in->id && qp->saddr == qp_in->saddr && qp->daddr == qp_in->daddr && @@ -337,10 +327,7 @@ static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) atomic_inc(&qp->refcnt); atomic_inc(&qp->refcnt); - if((qp->next = ipq_hash[hash]) != NULL) - qp->next->pprev = &qp->next; - ipq_hash[hash] = qp; - qp->pprev = &ipq_hash[hash]; + hlist_add_head(&qp->list, &ipq_hash[hash]); INIT_LIST_HEAD(&qp->lru_list); list_add_tail(&qp->lru_list, &ipq_lru_list); ip_frag_nqueues++; @@ -392,9 +379,10 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) __u8 protocol = iph->protocol; unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); struct ipq *qp; + struct hlist_node *n; read_lock(&ipfrag_lock); - for(qp = ipq_hash[hash]; qp; qp = qp->next) { + hlist_for_each_entry(qp, n, &ipq_hash[hash], list) { if(qp->id == id && qp->saddr == saddr && qp->daddr == daddr && -- cgit From bd6af700a7191f483f41706467033588f28c8877 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 17 Nov 2005 14:11:18 -0800 Subject: [TCP]: TCP highspeed build error There is a compile error that crept in with the last batch of TCP patches. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_highspeed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 82b3c189bd7d..63cf7e540847 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -111,7 +111,7 @@ static void hstcp_init(struct sock *sk) } static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, - u32 in_flight, u32 pkts_acked) + u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct hstcp *ca = inet_csk_ca(sk); -- cgit From 2fce76afdb067fa3e7f8ee33c9fe366bd65887ea Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Thu, 17 Nov 2005 15:06:47 -0800 Subject: [NETFILTER] ip_conntrack: fix ftp/irc/tftp helpers on ports >= 32768 Since we converted the ftp/irc/tftp helpers to use the new module_param_array() some time ago, we were accidentally using signed data types - thus preventing those modules from being used on ports >= 32768. This patch fixes it by using 'ushort' module parameters. Thanks to Jan Nijs for reporting this bug.
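The failure mode is worth a concrete illustration. With a signed 'short', any port in the range 32768..65535 wraps to a negative value when the parameter is parsed, so the helper silently never matches real traffic; for instance, ports=33333 reads back as -32203. A minimal sketch of the corrected declaration (module and parameter names are illustrative, mirroring the helpers below):

	#include <linux/module.h>

	#define MAX_PORTS 8
	/* was: static short ports[MAX_PORTS]; */
	static unsigned short ports[MAX_PORTS];
	static int ports_c;
	/* the type token must match the storage type: ushort, not short */
	module_param_array(ports, ushort, &ports_c, 0400);
	MODULE_PARM_DESC(ports, "port numbers to watch");

Signed-off-by: Harald Welte Signed-off-by: David S.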
Miller --- net/ipv4/netfilter/ip_conntrack_ftp.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_irc.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_tftp.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index d77d6b3f5f80..59e12b02b22c 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -29,9 +29,9 @@ static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); static int loose; module_param(loose, int, 0600); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 15457415a4f3..2dea1db14406 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -34,7 +34,7 @@ #include #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; @@ -52,7 +52,7 @@ EXPORT_SYMBOL_GPL(ip_nat_irc_hook); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); MODULE_LICENSE("GPL"); -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of IRC servers"); module_param(max_dcc_channels, int, 0400); MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c index a78736b8525d..d3c5a371f993 100644 --- a/net/ipv4/netfilter/ip_conntrack_tftp.c +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -26,9 +26,9 @@ MODULE_DESCRIPTION("tftp connection tracking helper"); MODULE_LICENSE("GPL"); #define MAX_PORTS 8 -static short ports[MAX_PORTS]; +static unsigned short ports[MAX_PORTS]; static int ports_c; -module_param_array(ports, short, &ports_c, 0400); +module_param_array(ports, ushort, &ports_c, 0400); MODULE_PARM_DESC(ports, "port numbers of tftp servers"); #if 0 -- cgit From c9e53cbe7ad6eabb3c7c5140b6127b4e5f9ee840 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 20 Nov 2005 21:09:00 -0800 Subject: [FIB_TRIE]: Don't show local table in /proc/net/route output Don't show local table to behave similar to fib_hash. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/fib_trie.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 66247f38b371..705e3ce86df9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2378,6 +2378,7 @@ static unsigned fib_flag_trans(int type, u32 mask, const struct fib_info *fi) */ static int fib_route_seq_show(struct seq_file *seq, void *v) { + const struct fib_trie_iter *iter = seq->private; struct leaf *l = v; int i; char bf[128]; @@ -2389,6 +2390,8 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } + if (iter->trie == trie_local) + return 0; if (IS_TNODE(l)) return 0; -- cgit From 2b8f2ff6f4c11fff9c3016b54fa261f522a54b70 Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Sun, 20 Nov 2005 21:09:55 -0800 Subject: [NETFILTER]: fixed dependencies between modules related with ip_conntrack - IP_NF_CONNTRACK_MARK is bool and depends on only IP_NF_CONNTRACK which is tristate. If a variable depends on IP_NF_CONNTRACK_MARK and doesn't care about IP_NF_CONNTRACK, it can be y. This must be avoided. - IP_NF_CT_ACCT has same problem. - IP_NF_TARGET_CLUSTERIP also depends on IP_NF_MANGLE. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9d3c8b5f327e..0bc00528d888 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -440,7 +440,7 @@ config IP_NF_MATCH_COMMENT config IP_NF_MATCH_CONNMARK tristate 'Connection mark match support' depends on IP_NF_IPTABLES - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `connmark' match, which allows you to match the connection mark value previously set for the session by `CONNMARK'. @@ -452,7 +452,7 @@ config IP_NF_MATCH_CONNMARK config IP_NF_MATCH_CONNBYTES tristate 'Connection byte/packet counter match support' depends on IP_NF_IPTABLES - depends on IP_NF_CT_ACCT || (NF_CT_ACCT && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CT_ACCT) || (NF_CT_ACCT && NF_CONNTRACK_IPV4) help This option adds a `connbytes' match, which allows you to match the number of bytes and/or packets for each direction within a connection. @@ -767,7 +767,7 @@ config IP_NF_TARGET_TTL config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' depends on IP_NF_MANGLE - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help This option adds a `CONNMARK' target, which allows one to manipulate the connection mark value. 
Similar to the MARK target, but @@ -779,8 +779,8 @@ config IP_NF_TARGET_CONNMARK config IP_NF_TARGET_CLUSTERIP tristate "CLUSTERIP target support (EXPERIMENTAL)" - depends on IP_NF_IPTABLES && EXPERIMENTAL - depends on IP_NF_CONNTRACK_MARK || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) + depends on IP_NF_MANGLE && EXPERIMENTAL + depends on (IP_NF_CONNTRACK && IP_NF_CONNTRACK_MARK) || (NF_CONNTRACK_MARK && NF_CONNTRACK_IPV4) help The CLUSTERIP target allows you to build load-balancing clusters of network servers without having a dedicated load-balancing -- cgit From 0ff60a45678e67b2547256a636fd00c1667ce4fa Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Tue, 22 Nov 2005 14:47:37 -0800 Subject: [IPV4]: Fix secondary IP addresses after promotion This patch fixes the problem with promoting aliases when: a) a single primary and > 1 secondary addresses b) multiple primary addresses each with at least one secondary address Based on earlier efforts from Brian Pomerantz , Patrick McHardy and Thomas Graf Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 40 ++++++++++++++++++++++++++++++---------- net/ipv4/fib_frontend.c | 2 +- 2 files changed, 31 insertions(+), 11 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 4ec4b2ca6ab1..04a6fe3e95a2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -234,7 +234,10 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy) { struct in_ifaddr *promote = NULL; - struct in_ifaddr *ifa1 = *ifap; + struct in_ifaddr *ifa, *ifa1 = *ifap; + struct in_ifaddr *last_prim = in_dev->ifa_list; + struct in_ifaddr *prev_prom = NULL; + int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); @@ -243,18 +246,22 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { - struct in_ifaddr *ifa; struct in_ifaddr **ifap1 = &ifa1->ifa_next; while ((ifa = *ifap1) != NULL) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY) && + ifa1->ifa_scope <= ifa->ifa_scope) + last_prim = ifa; + if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; + prev_prom = ifa; continue; } - if (!IN_DEV_PROMOTE_SECONDARIES(in_dev)) { + if (!do_promote) { *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa); @@ -283,18 +290,31 @@ static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, */ rtmsg_ifa(RTM_DELADDR, ifa1); notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); - if (destroy) { - inet_free_ifa(ifa1); - if (!in_dev->ifa_list) - inetdev_destroy(in_dev); - } + if (promote) { + + if (prev_prom) { + prev_prom->ifa_next = promote->ifa_next; + promote->ifa_next = last_prim->ifa_next; + last_prim->ifa_next = promote; + } - if (promote && IN_DEV_PROMOTE_SECONDARIES(in_dev)) { - /* not sure if we should send a delete notify first? 
*/ promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote); notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); + for (ifa = promote->ifa_next; ifa; ifa = ifa->ifa_next) { + if (ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) + continue; + fib_add_ifaddr(ifa); + } + + } + if (destroy) { + inet_free_ifa(ifa1); + + if (!in_dev->ifa_list) + inetdev_destroy(in_dev); } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 2267c1fad879..882f88f6d13b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -407,7 +407,7 @@ static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); } -static void fib_add_ifaddr(struct in_ifaddr *ifa) +void fib_add_ifaddr(struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct net_device *dev = in_dev->dev; -- cgit From 00cb277a4a1fb76aafb2fb28aa99f30546e619c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 22 Nov 2005 14:54:34 -0800 Subject: [NETFILTER] ctnetlink: Fix refcount leak ip_conntrack/nat_proto Remove proto == NULL checking since ip_conntrack_[nat_]proto_find_get always returns a valid pointer. Fix missing ip_conntrack_proto_put in some paths. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index de9f4464438d..6c18a2b6d5ce 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -59,11 +59,13 @@ ctnetlink_dump_tuples_proto(struct sk_buff *skb, NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + /* If no protocol helper is found, this function will return the + * generic protocol helper, so proto won't *ever* be NULL */ proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (likely(proto && proto->tuple_to_nfattr)) { + if (likely(proto->tuple_to_nfattr)) ret = proto->tuple_to_nfattr(skb, tuple); - ip_conntrack_proto_put(proto); - } + + ip_conntrack_proto_put(proto); return ret; @@ -128,9 +130,11 @@ ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) struct nfattr *nest_proto; int ret; - - if (!proto || !proto->to_nfattr) + + if (!proto->to_nfattr) { + ip_conntrack_proto_put(proto); return 0; + } nest_proto = NFA_NEST(skb, CTA_PROTOINFO); @@ -527,10 +531,10 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, proto = ip_conntrack_proto_find_get(tuple->dst.protonum); - if (likely(proto && proto->nfattr_to_tuple)) { + if (likely(proto->nfattr_to_tuple)) ret = proto->nfattr_to_tuple(tb, tuple); - ip_conntrack_proto_put(proto); - } + + ip_conntrack_proto_put(proto); return ret; } @@ -596,8 +600,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, return -EINVAL; npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); - if (!npt) - return 0; if (!npt->nfattr_to_range) { ip_nat_proto_put(npt); @@ -957,8 +959,6 @@ ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[]) nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr); proto = ip_conntrack_proto_find_get(npt); - if (!proto) - return -EINVAL; if (proto->from_nfattr) err = proto->from_nfattr(tb, ct); -- cgit From de919820cf7fe6674cdf47f8f47d2af284e4309f Mon Sep 17 00:00:00 2001 From: Benoit 
Boissinot Date: Wed, 23 Nov 2005 19:03:46 -0800 Subject: [NETFILTER]: ip_conntrack_netlink.c needs linux/interrupt.h net/ipv4/netfilter/ip_conntrack_netlink.c: In function 'ctnetlink_dump_table': net/ipv4/netfilter/ip_conntrack_netlink.c:409: warning: implicit declaration of function 'local_bh_disable' net/ipv4/netfilter/ip_conntrack_netlink.c:427: warning: implicit declaration of function 'local_bh_enable' Signed-off-by: Benoit Boissinot Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 6c18a2b6d5ce..3fce91bcc0ba 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -27,6 +27,7 @@ #include #include #include +#include <linux/interrupt.h> #include #include -- cgit From 18955cfcb2a5d75a08e0cb297f13ccfb6904de48 Mon Sep 17 00:00:00 2001 From: Mike Stroyan Date: Tue, 29 Nov 2005 16:12:55 -0800 Subject: [IPV4] tcp/route: Another look at hash table sizes The tcp_ehash hash table gets too big on systems with really big memory. It is worse on systems with pages larger than 4KB. It wastes memory that could be better used. It also makes the netstat command slow because reading /proc/net/tcp and /proc/net/tcp6 needs to go through the full hash table. The default value should not be larger for larger page sizes. It seems that the effect of page size is an unintended error dating back a long time. I also wonder if the default value really should be a larger fraction of memory for systems with more memory. While systems with really big RAM can afford more space for hash tables, it is not clear to me that they benefit from increasing the allocation ratio for this table. The amount of memory allocated is determined by net/ipv4/tcp.c:tcp_init and mm/page_alloc.c:alloc_large_system_hash. tcp_init calls alloc_large_system_hash passing parameters- bucketsize=sizeof(struct tcp_ehash_bucket) numentries=thash_entries scale=(num_physpages >= 128 * 1024) ? (25-PAGE_SHIFT) : (27-PAGE_SHIFT) limit=0 On i386, PAGE_SHIFT is 12 for a page size of 4K On ia64, PAGE_SHIFT defaults to 14 for a page size of 16K The num_physpages test above makes the allocation take a larger fraction of the total memory on systems with larger memory. The threshold size for an i386 system is 512MB. For an ia64 system with 16KB pages the threshold is 2GB. For smaller memory systems- On i386, scale = (27 - 12) = 15 On ia64, scale = (27 - 14) = 13 For larger memory systems- On i386, scale = (25 - 12) = 13 On ia64, scale = (25 - 14) = 11 For the rest of this discussion, I'll just track the larger memory case. The default behavior has numentries=thash_entries=0, so the allocated size is determined by either scale or by the default limit of 1/16 of total memory. In alloc_large_system_hash- | numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | numentries >>= 20 - PAGE_SHIFT; | numentries <<= 20 - PAGE_SHIFT; At this point, numentries is pages for all of memory, rounded up to the nearest megabyte boundary. | /* limit to 1 bucket per 2^scale bytes of low memory */ | if (scale > PAGE_SHIFT) | numentries >>= (scale - PAGE_SHIFT); | else | numentries <<= (PAGE_SHIFT - scale); On i386, numentries >>= (13 - 12), so numentries is 1/8192 of bytes of total memory.
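To make that concrete, take a hypothetical i386 box with 1GB of memory and 4KB pages- | numentries = (1 << 30) >> 12; /* 262144 pages */ | numentries >>= (13 - 12); /* 131072 entries, i.e. (1 << 30) / 8192 */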
On ia64, numentries <<= (14 - 11), so numentries is 1/2048 of bytes of total memory. | log2qty = long_log2(numentries); | | do { | size = bucketsize << log2qty; bucketsize is 16, so size is 16 times numentries, rounded down to a power of two. On i386, size is 1/512 of bytes of total memory. On ia64, size is 1/128 of bytes of total memory. For smaller systems the results are On i386, size is 1/2048 of bytes of total memory. On ia64, size is 1/512 of bytes of total memory. The large page effect can be removed by just replacing the use of PAGE_SHIFT with a constant of 12 in the calls to alloc_large_system_hash. That makes them more like the other uses of that function from fs/inode.c and fs/dcache.c Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +-- net/ipv4/tcp.c | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 381dd6a6aebb..e9c14f4a2eba 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3149,8 +3149,7 @@ int __init ip_rt_init(void) sizeof(struct rt_hash_bucket), rhash_entries, (num_physpages >= 128 * 1024) ? - (27 - PAGE_SHIFT) : - (29 - PAGE_SHIFT), + 15 : 17, HASH_HIGHMEM, &rt_hash_log, &rt_hash_mask, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9ac7a4f46bd8..5e6bc4b32875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2065,8 +2065,7 @@ void __init tcp_init(void) sizeof(struct inet_ehash_bucket), thash_entries, (num_physpages >= 128 * 1024) ? - (25 - PAGE_SHIFT) : - (27 - PAGE_SHIFT), + 13 : 15, HASH_HIGHMEM, &tcp_hashinfo.ehash_size, NULL, @@ -2082,8 +2081,7 @@ void __init tcp_init(void) sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? - (25 - PAGE_SHIFT) : - (27 - PAGE_SHIFT), + 13 : 15, HASH_HIGHMEM, &tcp_hashinfo.bhash_size, NULL, -- cgit From 9b5b5cff9a6655dbb6d2e2be365bb95eec3950eb Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 29 Nov 2005 16:21:38 -0800 Subject: [NET]: Add const markers to various variables. the patch below marks various variables const in net/; the goal is to move them to the .rodata section so that they can't false-share cachelines with things that get written to, as well as potentially helping gcc a bit with optimisations. (these were found using a gcc patch to warn about such variables) Signed-off-by: Arjan van de Ven Signed-off-by: David S. 
Miller --- net/ipv4/fib_hash.c | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/icmp.c | 4 ++-- net/ipv4/ipvs/ip_vs_conn.c | 2 +- net/ipv4/ipvs/ip_vs_ctl.c | 4 ++-- net/ipv4/ipvs/ip_vs_proto_tcp.c | 2 +- net/ipv4/netfilter/ip_conntrack_amanda.c | 2 +- net/ipv4/netfilter/ip_conntrack_ftp.c | 2 +- net/ipv4/netfilter/ip_conntrack_irc.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 6 +++--- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/netfilter/ipt_LOG.c | 2 +- net/ipv4/proc.c | 10 +++++----- net/ipv4/route.c | 2 +- net/ipv4/tcp.c | 2 +- 17 files changed, 27 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 2a8c9afc3695..7ea0209cb169 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -975,7 +975,7 @@ static void fib_seq_stop(struct seq_file *seq, void *v) static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi) { - static unsigned type2flags[RTN_MAX + 1] = { + static const unsigned type2flags[RTN_MAX + 1] = { [7] = RTF_REJECT, [8] = RTF_REJECT, }; unsigned flags = type2flags[type]; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 186f20c4a45e..6d2a6ac070e3 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -83,7 +83,7 @@ for (nhsel=0; nhsel < 1; nhsel++) #define endfor_nexthops(fi) } -static struct +static const struct { int error; u8 scope; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index e3eceecd0496..92e23b2ad4d2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -220,7 +220,7 @@ struct icmp_control { short error; /* This ICMP is classed as an error message */ }; -static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; /* * The ICMP socket(s). This is the most convenient way to flow control @@ -994,7 +994,7 @@ error: /* * This table is the definition of how we handle ICMP. */ -static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { +static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { [ICMP_ECHOREPLY] = { .output_entry = ICMP_MIB_OUTECHOREPS, .input_entry = ICMP_MIB_INECHOREPS, diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index f828fa2eb7de..2a3a8c59c655 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -771,7 +771,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) * The drop rate array needs tuning for real environments. 
* Called from timer bh only => no locking */ - static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; static char todrop_counter[9] = {0}; int i; diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 2d66848e7aa0..9bdcf31b760e 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1909,7 +1909,7 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) #define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) #define MAX_ARG_LEN SVCDEST_ARG_LEN -static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, @@ -2180,7 +2180,7 @@ __ip_vs_get_timeouts(struct ip_vs_timeout_user *u) #define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) #define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) -static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index c19408973c09..0e878fd6215c 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -251,7 +251,7 @@ tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 -static int tcp_state_off[IP_VS_DIR_LAST] = { +static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index fa3f914117ec..e52847fa10f5 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -37,7 +37,7 @@ MODULE_LICENSE("GPL"); module_param(master_timeout, int, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); -static char *conns[] = { "DATA ", "MESG ", "INDEX " }; +static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; /* This is slow, but it's simple. 
--RR */ static char *amanda_buffer; diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 59e12b02b22c..68b173bcda60 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -55,7 +55,7 @@ static int try_rfc959(const char *, size_t, u_int32_t [], char); static int try_eprt(const char *, size_t, u_int32_t [], char); static int try_epsv_response(const char *, size_t, u_int32_t [], char); -static struct ftp_search { +static const struct ftp_search { enum ip_conntrack_dir dir; const char *pattern; size_t plen; diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 2dea1db14406..d7c40421d0d1 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -59,7 +59,7 @@ MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC module_param(dcc_timeout, int, 0400); MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); -static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; +static const char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; #define MINMATCHLEN 5 #if 0 diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index e4d6b268e8c4..5f9925db608e 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -51,7 +51,7 @@ static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, const struct ip_conntrack_tuple *orig) { /* Add 1; spaces filled with 0. */ - static u_int8_t invmap[] + static const u_int8_t invmap[] = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, [ICMP_ECHOREPLY] = ICMP_ECHO + 1, [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, @@ -110,7 +110,7 @@ static int icmp_packet(struct ip_conntrack *ct, return NF_ACCEPT; } -static u_int8_t valid_new[] = { +static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 59a4a0111dd3..977fb59d4563 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -65,7 +65,7 @@ static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; -static unsigned long * sctp_timeouts[] +static const unsigned long * sctp_timeouts[] = { NULL, /* SCTP_CONNTRACK_NONE */ &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ @@ -118,7 +118,7 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index ee3b7d6c4d2e..625981676776 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -99,7 +99,7 @@ unsigned long ip_ct_tcp_timeout_close = 10 SECS; to ~13-30min depending on RTO. 
*/ unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; -static unsigned long * tcp_timeouts[] +static const unsigned long * tcp_timeouts[] = { NULL, /* TCP_CONNTRACK_NONE */ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ @@ -170,7 +170,7 @@ enum tcp_bit_set { * if they are invalid * or we do not support the request (simultaneous open) */ -static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { +static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ @@ -817,7 +817,7 @@ void ip_conntrack_tcp_update(struct sk_buff *skb, #define TH_CWR 0x80 /* table of valid flag combinations - ECE and CWR are always valid */ -static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_ACK] = 1, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 75c27e92f6ab..45886c8475e8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1892,7 +1892,7 @@ static int ipt_get_matches(char *buffer, char **start, off_t offset, int length) return pos; } -static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = +static const struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = { { "ip_tables_names", ipt_get_tables }, { "ip_tables_targets", ipt_get_targets }, { "ip_tables_matches", ipt_get_matches }, diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 92ed050fac69..180d6ca04af6 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -197,7 +197,7 @@ static void dump_packet(const struct nf_loginfo *info, } case IPPROTO_ICMP: { struct icmphdr _icmph, *ich; - static size_t required_len[NR_ICMP_TYPES+1] + static const size_t required_len[NR_ICMP_TYPES+1] = { [ICMP_ECHOREPLY] = 4, [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr), diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a65e508fbd40..0d7dc668db46 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -98,7 +98,7 @@ fold_field(void *mib[], int offt) } /* snmp items */ -static struct snmp_mib snmp4_ipstats_list[] = { +static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), @@ -119,7 +119,7 @@ static struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_icmp_list[] = { +static const struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS), @@ -149,7 +149,7 @@ static struct snmp_mib snmp4_icmp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_tcp_list[] = { +static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), @@ -167,7 +167,7 @@ static struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_udp_list[] = { +static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), @@ -175,7 +175,7 @@ static struct snmp_mib 
snmp4_udp_list[] = { SNMP_MIB_SENTINEL }; -static struct snmp_mib snmp4_net_list[] = { +static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e9c14f4a2eba..f701a136a6ae 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1371,7 +1371,7 @@ out: kfree_skb(skb); * are needed for AMPRnet AX.25 paths. */ -static unsigned short mtu_plateau[] = +static const unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; static __inline__ unsigned short guess_mtu(unsigned short old_mtu) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5e6bc4b32875..ef98b14ac56d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1413,7 +1413,7 @@ recv_urg: * closed. */ -static unsigned char new_state[16] = { +static const unsigned char new_state[16] = { /* current state: new state: action: */ /* (Invalid) */ TCP_CLOSE, /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, -- cgit From 4b30b1c6a3e58dc74f2dbb0aa39f16a23cfcdd56 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 29 Nov 2005 16:27:20 -0800 Subject: [IPV4]: make two functions static This patch makes two needlessly global functions static. Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4e9c74b54b15..a4c347c3b8e3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1217,7 +1217,7 @@ static int ipgre_tunnel_init(struct net_device *dev) return 0; } -int __init ipgre_fb_tunnel_init(struct net_device *dev) +static int __init ipgre_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; struct iphdr *iph = &tunnel->parms.iph; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 11c2f68254f0..eba64e2bd397 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -690,7 +690,7 @@ csum_page(struct page *page, int offset, int copy) return csum; } -inline int ip_ufo_append_data(struct sock *sk, +static inline int ip_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, -- cgit From d127e94a5cf1c9c996b8c67cd2595b96c3e35e4c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 29 Nov 2005 16:28:18 -0800 Subject: [NETFILTER] ipv4: small cleanups This patch contains the following cleanups: - make needlessly global code static - ip_conntrack_core.c: ip_conntrack_flush() -> ip_conntrack_flush(void) Signed-off-by: Adrian Bunk Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 4 ++-- net/ipv4/netfilter/ip_nat_core.c | 2 +- net/ipv4/netfilter/ipt_LOG.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 422ab68ee7fb..7a4ecddd597b 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1354,7 +1354,7 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) get_order(sizeof(struct list_head) * size)); } -void ip_conntrack_flush() +void ip_conntrack_flush(void) { /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module @@ -1408,7 +1408,7 @@ static struct list_head *alloc_hashtable(int size, int *vmalloced) return hash; } -int set_hashsize(const char *val, struct kernel_param *kp) +static int set_hashsize(const char *val, struct kernel_param *kp) { int i, bucket, hashsize, vmalloced; int old_vmalloced, old_size; diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 762f4d93936b..c1a61462507f 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -49,7 +49,7 @@ static unsigned int ip_nat_htable_size; static struct list_head *bysource; #define MAX_IP_NAT_PROTO 256 -struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; static inline struct ip_nat_protocol * __ip_nat_proto_find(u_int8_t protonum) diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 180d6ca04af6..30be0f1dae37 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -351,7 +351,7 @@ static void dump_packet(const struct nf_loginfo *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } -struct nf_loginfo default_loginfo = { +static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { -- cgit From 73f306024c15bd12e59677d6eaf43ecced614f04 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Thu, 1 Dec 2005 14:28:58 -0800 Subject: [NETFILTER]: Ignore ACKs ACKs on half open connections in TCP conntrack Mounting NFS file systems after a (warm) reboot could take a long time if firewalling and connection tracking was enabled. The reason is that the NFS clients tends to use the same ports (800 and counting down). Now on reboot, the server would still have a TCB for an existing TCP connection client:800 -> server:2049. The client sends a SYN from port 800 to server:2049, which elicits an ACK from the server. The firewall on the client drops the ACK because (from its point of view) the connection is still in half-open state, and it expects to see a SYNACK. The client will eventually time out after several minutes. The following patch corrects this, by accepting ACKs on half open connections as well. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 625981676776..aeb7353d4777 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -272,9 +272,9 @@ static const enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* - * sSS -> sIV Might be a half-open connection. + * sSS -> sIG Might be a half-open connection. * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -917,8 +917,12 @@ static int tcp_packet(struct ip_conntrack *conntrack, switch (new_state) { case TCP_CONNTRACK_IGNORE: - /* Either SYN in ORIGINAL * or SYN/ACK in REPLY. */ + /* Ignored packets: + * + * a) SYN in ORIGINAL + * b) SYN/ACK in REPLY + * c) ACK in reply direction after initial SYN in original. + */ if (index == TCP_SYNACK_SET && conntrack->proto.tcp.last_index == TCP_SYN_SET && conntrack->proto.tcp.last_dir != dir @@ -985,13 +989,20 @@ static int tcp_packet(struct ip_conntrack *conntrack, } case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET - && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { - /* RST sent to invalid SYN we had let trough - * SYN was in window then, tear down connection. + /* RST sent to invalid SYN or ACK we had let trough + * at a) and c) above: + * + * a) SYN was in window then + * c) we hold a half-open connection. + * + * Delete our connection entry. * We skip window checking, because packet might ACK - * segments we ignored in the SYN. */ + * segments we ignored. */ goto in_window; } /* Just fall trough */ -- cgit From 2a43c4af3fa2e701008d51c28365e26fccf9cbb0 Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Thu, 1 Dec 2005 14:29:24 -0800 Subject: [NETFILTER]: Fix recent match jiffies wrap mismatches Around jiffies wrap time (i.e. within first 5 mins after boot), recent match rules which contain both --seconds and --hitcount arguments experience false matches. This is because the last_pkts array is filled with zeros on creation, and when comparing 'now' to 0 (+ --seconds argument), time_before_eq thinks it has found a hit. The patch below adds a break if the packet value is zero. This has the unfortunate side effect of causing mismatches if a packet was received when jiffies really was equal to zero. The odds of that happening are slim compared to the problems caused by not adding the break however. Plus, the author used this same method just below, so it is "good enough". This fixes netfilter bugs #383 and #395.
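The arithmetic is worth spelling out. Kernel time comparisons are modular, so a timestamp of 0 is not "long ago"; it is a point at most 2^31 ticks away in either direction on a 32-bit counter. A toy trace of the bad comparison, assuming HZ=1000 and --seconds 60 (values picked purely for illustration):

	unsigned long now  = 42;   /* jiffies shortly after the wrap  */
	unsigned long last = 0;    /* slot never written: zero-filled */

	/* expands to (long)(now - (last + 60000)) <= 0, which is true,
	 * so an empty slot counts as a hit just after the wrap */
	if (time_before_eq(now, last + 60 * HZ))
		hits_found++;

Hence the one-line guard: a slot that still holds zero was never written, so the scan can stop there. Signed-off-by: Phil Oester Signed-off-by: Patrick McHardy Signed-off-by: David S.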
Miller --- net/ipv4/netfilter/ipt_recent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 2d44b07688af..261cbb4d4c49 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -532,6 +532,7 @@ match(const struct sk_buff *skb, } if(info->seconds && info->hit_count) { for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(r_list[location].last_pkts[pkt_count] == 0) break; if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; } if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; -- cgit From ea86575eaf99a9262a969309d934318028dbfacb Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 1 Dec 2005 14:30:00 -0800 Subject: [NETLINK]: Fix processing of fib_lookup netlink messages The receive path for fib_lookup netlink messages is lacking sanity checks for header and payload and is thus vulnerable to malformed netlink messages causing illegal memory references. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 882f88f6d13b..19b1b984d687 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -544,12 +544,16 @@ static void nl_fib_input(struct sock *sk, int len) struct sk_buff *skb = NULL; struct nlmsghdr *nlh = NULL; struct fib_result_nl *frn; - int err; u32 pid; struct fib_table *tb; - skb = skb_recv_datagram(sk, 0, 0, &err); + skb = skb_dequeue(&sk->sk_receive_queue); nlh = (struct nlmsghdr *)skb->data; + if (skb->len < NLMSG_SPACE(0) || skb->len < nlh->nlmsg_len || + nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*frn))) { + kfree_skb(skb); + return; + } frn = (struct fib_result_nl *) NLMSG_DATA(nlh); tb = fib_get_table(frn->tb_id_in); -- cgit From 24c6927505ca77ee4ac25fb31dcd56f6506979ed Mon Sep 17 00:00:00 2001 From: David Stevens Date: Fri, 2 Dec 2005 20:32:59 -0800 Subject: [IGMP]: workaround for IGMP v1/v2 bug From: David Stevens As explained at: http://www.cs.ucsb.edu/~krishna/igmp_dos/ With IGMP version 1 and 2 it is possible to inject a unicast report to a client which will make it ignore multicast reports sent later by the router. The fix is to only accept the report if it was sent to a multicast or broadcast address. Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c04607b49212..4a195c724f01 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -897,7 +897,10 @@ int igmp_rcv(struct sk_buff *skb) /* Is it our report looped back? */ if (((struct rtable*)skb->dst)->fl.iif == 0) break; - igmp_heard_report(in_dev, ih->group); + /* don't rely on MC router hearing unicast reports */ + if (skb->pkt_type == PACKET_MULTICAST || + skb->pkt_type == PACKET_BROADCAST) + igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1
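The reasoning behind the new test: a report that arrived as link-layer unicast was addressed to this host alone, so no multicast router can have seen it, and it must not be allowed to suppress our own membership reports. A rough restatement of the added guard (same field names as the diff; the switch form is used only for readability):

	switch (skb->pkt_type) {
	case PACKET_MULTICAST:
	case PACKET_BROADCAST:
		/* the router heard this report too; suppression is safe */
		igmp_heard_report(in_dev, ih->group);
		break;
	default:
		/* link-layer unicast: only we saw it, possibly forged */
		break;
	}

-- cgit From 86c8f9d158f68538a971a47206a46a22c7479bac Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2005 20:43:26 -0800 Subject: [IPV4] Fix EPROTONOSUPPORT error in inet_create There is a coding error in inet_create that causes it to always return ESOCKTNOSUPPORT.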
From 24c6927505ca77ee4ac25fb31dcd56f6506979ed Mon Sep 17 00:00:00 2001 From: David Stevens Date: Fri, 2 Dec 2005 20:32:59 -0800 Subject: [IGMP]: workaround for IGMP v1/v2 bug From: David Stevens As explained at: http://www.cs.ucsb.edu/~krishna/igmp_dos/ With IGMP version 1 and 2 it is possible to inject a unicast report to a client which will make it ignore multicast reports sent later by the router. The fix is to accept the report only if it was sent to a multicast or broadcast address. Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c04607b49212..4a195c724f01 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -897,7 +897,10 @@ int igmp_rcv(struct sk_buff *skb) /* Is it our report looped back? */ if (((struct rtable*)skb->dst)->fl.iif == 0) break; - igmp_heard_report(in_dev, ih->group); + /* don't rely on MC router hearing unicast reports */ + if (skb->pkt_type == PACKET_MULTICAST || + skb->pkt_type == PACKET_BROADCAST) + igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 -- cgit From 86c8f9d158f68538a971a47206a46a22c7479bac Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Dec 2005 20:43:26 -0800 Subject: [IPV4] Fix EPROTONOSUPPORT error in inet_create There is a coding error in inet_create that causes it to always return ESOCKTNOSUPPORT. It should return EPROTONOSUPPORT when there are protocols registered for a given socket type but none of them match the requested protocol. This is based on a patch by Jayachandran C. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaa150c33b04..d368cf249000 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -228,13 +228,14 @@ static int inet_create(struct socket *sock, int protocol) unsigned char answer_flags; char answer_no_check; int try_loading_module = 0; - int err = -ESOCKTNOSUPPORT; + int err; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ answer = NULL; lookup_protocol: + err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_rcu(p, &inetsw[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -252,6 +253,7 @@ lookup_protocol: if (IPPROTO_IP == answer->protocol) break; } + err = -EPROTONOSUPPORT; answer = NULL; } @@ -280,9 +282,6 @@ lookup_protocol: err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; - err = -EPROTONOSUPPORT; - if (!protocol) - goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; -- cgit
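The shape of the inet_create fix is easier to see outside the RCU and module-loading plumbing. A condensed userspace sketch (hypothetical table and lookup(); the real loop also honours IPPROTO_IP wildcard entries and retries after module autoload, both omitted here):

    #include <errno.h>
    #include <netinet/in.h>
    #include <stdio.h>
    #include <sys/socket.h>

    struct protosw { int type; int protocol; };

    static const struct protosw table[] = {
            { SOCK_STREAM, IPPROTO_TCP },
            { SOCK_DGRAM,  IPPROTO_UDP },
    };

    static int lookup(int type, int protocol)
    {
            /* Primed on every pass: nothing registered for this type. */
            int err = -ESOCKTNOSUPPORT;
            size_t i;

            for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
                    if (table[i].type != type)
                            continue;
                    if (table[i].protocol == protocol)
                            return 0;
                    /* Right type seen, wrong protocol: downgrade. */
                    err = -EPROTONOSUPPORT;
            }
            return err;
    }

    int main(void)
    {
            /* Prints 0, then -EPROTONOSUPPORT, then -ESOCKTNOSUPPORT
             * (-93 and -94 on Linux). */
            printf("%d %d %d\n",
                   lookup(SOCK_STREAM, IPPROTO_TCP),
                   lookup(SOCK_STREAM, IPPROTO_SCTP),
                   lookup(SOCK_RAW, IPPROTO_TCP));
            return 0;
    }

Moving the err assignments inside the lookup pass is what lets the retry via try_loading_module start each pass from a clean slate instead of a stale error code.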
From 8d1ca69984ed1e5930c0537b8f606c54007d7319 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 5 Dec 2005 13:32:14 -0800 Subject: [NETFILTER]: Fix incorrect argument to ip_nat_initialized() in ctnetlink ip_nat_initialized() takes enum ip_nat_manip_type as its second argument, not a hook number. Noticed and initial patch by Marcus Sundberg. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 3fce91bcc0ba..70402e0ed00a 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -877,7 +877,7 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) DEBUGP("NAT status: %lu\n", status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - if (ip_nat_initialized(ct, hooknum)) + if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) return -EEXIST; ip_nat_setup_info(ct, &range, hooknum); -- cgit From afe5c6bb034bfa5824f8e7def6a739653e8f4655 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:33:50 -0800 Subject: [NETFILTER]: Fix ip_conntrack_flush abuse in ctnetlink ip_conntrack_flush() used to be part of ip_conntrack_cleanup(), which needs to drop _all_ references on module unload. A table flush via ctnetlink just needs to clean the table and doesn't need to flush the event cache or wait for any references attached to skbs. Move everything but pure table flushing back to ip_conntrack_cleanup(). Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 7a4ecddd597b..84c66dbfedaf 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1345,6 +1345,11 @@ static int kill_all(struct ip_conntrack *i, void *data) return 1; } +void ip_conntrack_flush(void) +{ + ip_ct_iterate_cleanup(kill_all, NULL); +} + static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) { if (vmalloced) @@ -1354,8 +1359,12 @@ static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size) get_order(sizeof(struct list_head) * size)); } -void ip_conntrack_flush(void) +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) { + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ @@ -1363,7 +1372,7 @@ void ip_conntrack_flush(void) ip_ct_event_cache_flush(); i_see_dead_people: - ip_ct_iterate_cleanup(kill_all, NULL); + ip_conntrack_flush(); if (atomic_read(&ip_conntrack_count) != 0) { schedule(); goto i_see_dead_people; @@ -1371,14 +1380,7 @@ void ip_conntrack_flush(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); -} -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) -{ - ip_ct_attach = NULL; - ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, -- cgit From 0be7fa92ca162bf5e7993c392e6f93909d617bbb Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:34:51 -0800 Subject: [NETFILTER]: Fix CTA_PROTO_NUM attribute size in ctnetlink CTA_PROTO_NUM is a u_int8_t. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 70402e0ed00a..d058ac41bfd3 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -503,7 +503,7 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) } static const size_t cta_min_proto[CTA_PROTO_MAX] = { - [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_NUM-1] = sizeof(u_int8_t), [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), @@ -528,7 +528,7 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, if (!tb[CTA_PROTO_NUM-1]) return -EINVAL; - tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + tuple->dst.protonum = *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); proto = ip_conntrack_proto_find_get(tuple->dst.protonum); -- cgit From a79575633300adb5d3f1bd856cc518c45fefcb86 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:36:25 -0800 Subject: [NETFILTER]: Mark ctnetlink as EXPERIMENTAL Should have been marked EXPERIMENTAL from the beginning, as the current bunch of fixes show. Signed-off-by: Patrick McHardy Signed-off-by: David S.
Miller --- net/ipv4/netfilter/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 0bc00528d888..88a60650e6b8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -56,8 +56,8 @@ config IP_NF_CONNTRACK_MARK instead of the individual packets. config IP_NF_CONNTRACK_EVENTS - bool "Connection tracking events" - depends on IP_NF_CONNTRACK + bool "Connection tracking events (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_CONNTRACK help If this option is enabled, the connection tracking code will provide a notifier chain that can be used by other kernel code @@ -66,8 +66,8 @@ config IP_NF_CONNTRACK_EVENTS IF unsure, say `N'. config IP_NF_CONNTRACK_NETLINK - tristate 'Connection tracking netlink interface' - depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + tristate 'Connection tracking netlink interface (EXPERIMENTAL)' + depends on EXPERIMENTAL && IP_NF_CONNTRACK && NETFILTER_NETLINK depends on IP_NF_CONNTRACK!=y || NETFILTER_NETLINK!=m help This option enables support for a netlink-based userspace interface -- cgit From 266c8543480e2202ab63d1d604a5ca049f350cd8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:37:33 -0800 Subject: [NETFILTER]: Fix unbalanced read_unlock_bh in ctnetlink NFA_NEST calls NFA_PUT which jumps to nfattr_failure if the skb has no room left. We call read_unlock_bh at nfattr_failure for the NFA_PUT inside the locked section, so move NFA_NEST inside the locked section too. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index aeb7353d4777..e7fa29e576dc 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -341,9 +341,10 @@ static int tcp_print_conntrack(struct seq_file *s, static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, const struct ip_conntrack *ct) { - struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); + struct nfattr *nest_parms; read_lock_bh(&tcp_lock); + nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP); NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), &ct->proto.tcp.state); read_unlock_bh(&tcp_lock); -- cgit From 2fdf1faa8e33082d691bcba18814276f2bd5a6f0 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 5 Dec 2005 13:38:16 -0800 Subject: [NETFILTER]: Don't use conntrack entry after dropping the reference Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index d058ac41bfd3..91fe8f2e38ff 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -728,11 +728,9 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return -ENOENT; } } - if (del_timer(&ct->timeout)) { - ip_conntrack_put(ct); + if (del_timer(&ct->timeout)) ct->timeout.function((unsigned long)ct); - return 0; - } + ip_conntrack_put(ct); DEBUGP("leaving\n"); -- cgit From 5b4956138173cb8b58d83d3173360e8e681a2b66 Mon Sep 17 00:00:00 2001 From: Thomas Young Date: Tue, 6 Dec 2005 16:16:34 -0800 Subject: [TCP] Vegas: stop resetting rtt every ack Move the resetting of rtt measurements to inside the once per RTT block of code. Signed-off-by: Thomas Young Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index b7d296a8ac6d..8f06a479305b 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -333,11 +333,11 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, else if (tp->snd_cwnd > tp->snd_cwnd_clamp) tp->snd_cwnd = tp->snd_cwnd_clamp; } - } - /* Wipe the slate clean for the next RTT. */ - vegas->cntRTT = 0; - vegas->minRTT = 0x7fffffff; + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } } /* Extract info for Tcp socket info provided via netlink. */ -- cgit From 0d7bef600acab393898bd5553e167496587da3e1 Mon Sep 17 00:00:00 2001 From: Thomas Young Date: Tue, 6 Dec 2005 16:17:11 -0800 Subject: [TCP] Vegas: Remove extra call to tcp_vegas_rtt_calc Remove unneeded call to tcp_vegas_rtt_calc. The more accurate microsecond value has already been registered prior to calling tcp_vegas_cong_avoid. Signed-off-by: Thomas Young Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 8f06a479305b..13e7e6e8df16 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -215,14 +215,6 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->beg_snd_nxt = tp->snd_nxt; vegas->beg_snd_cwnd = tp->snd_cwnd; - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - tcp_vegas_rtt_calc(sk, seq_rtt * 1000); - /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. -- cgit From dfb4b9dceb35c567a595ae5e9d035cfda044a103 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 6 Dec 2005 16:24:52 -0800 Subject: [TCP] Vegas: timestamp before clone We have to store the congestion control timestamp on the SKB before we clone it, not after. Else we get no timestamping information at all. tcp_transmit_skb() has been reworked so that we can do the timestamp still in one spot, instead of at all the call sites. Problem discovered, and initial fix, from Tom Young . Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 233 +++++++++++++++++++++++++++----------------------- 1 file changed, 124 insertions(+), 109 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 029c70dfb585..b7325e0b406a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -262,122 +262,139 @@ static __inline__ u16 tcp_select_window(struct sock *sk) * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { - if (skb != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - int tcp_header_size = tp->tcp_header_len; - struct tcphdr *th; - int sysctl_flags; - int err; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; + struct tcp_sock *tp; + struct tcp_skb_cb *tcb; + int tcp_header_size; + struct tcphdr *th; + int sysctl_flags; + int err; + + BUG_ON(!skb || !tcp_skb_pcount(skb)); + + /* If congestion control is doing timestamping, we must + * take such a timestamp before we potentially clone/copy. + */ + if (icsk->icsk_ca_ops->rtt_sample) + __net_timestamp(skb); + + if (likely(clone_it)) { + if (unlikely(skb_cloned(skb))) + skb = pskb_copy(skb, gfp_mask); + else + skb = skb_clone(skb, gfp_mask); + if (unlikely(!skb)) + return -ENOBUFS; + } - BUG_ON(!tcp_skb_pcount(skb)); + inet = inet_sk(sk); + tp = tcp_sk(sk); + tcb = TCP_SKB_CB(skb); + tcp_header_size = tp->tcp_header_len; #define SYSCTL_FLAG_TSTAMPS 0x1 #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 - /* If congestion control is doing timestamping */ - if (icsk->icsk_ca_ops->rtt_sample) - __net_timestamp(skb); - - sysctl_flags = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if(sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if(sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if(sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (tp->rx_opt.eff_sacks) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + sysctl_flags = 0; + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; } - - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - - th = (struct tcphdr *) skb_push(skb, tcp_header_size); - skb->h.th = th; - skb_set_owner_w(skb, sk); - - /* Build TCP header and checksum it. */ - th->source = inet->sport; - th->dest = inet->dport; - th->seq = htonl(tcb->seq); - th->ack_seq = htonl(tp->rcv_nxt); - *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); - if (tcb->flags & TCPCB_FLAG_SYN) { - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. 
- */ - th->window = htons(tp->rcv_wnd); - } else { - th->window = htons(tcp_select_window(sk)); + if (sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; } - th->check = 0; - th->urg_ptr = 0; - - if (tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); - th->urg = 1; + if (sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; } + } else if (unlikely(tp->rx_opt.eff_sacks)) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK)); + } + + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = inet->sport; + th->dest = inet->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->flags); + + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; - if (tcb->flags & TCPCB_FLAG_SYN) { - tcp_syn_build_options((__u32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent); - } else { - tcp_build_and_update_options((__u32 *)(th + 1), - tp, tcb->when); + if (unlikely(tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } - TCP_ECN_send(sk, tp, skb, tcp_header_size); - } - tp->af_specific->send_check(sk, th, skb->len, skb); + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rx_opt.rcv_wscale, + tcb->when, + tp->rx_opt.ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } - if (tcb->flags & TCPCB_FLAG_ACK) - tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); + tp->af_specific->send_check(sk, th, skb->len, skb); - if (skb->len != tcp_header_size) - tcp_event_data_sent(tp, skb, sk); + if (likely(tcb->flags & TCPCB_FLAG_ACK)) + tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); - err = tp->af_specific->queue_xmit(skb, 0); - if (err <= 0) - return err; + TCP_INC_STATS(TCP_MIB_OUTSEGS); - tcp_enter_cwr(sk); + err = tp->af_specific->queue_xmit(skb, 0); + if (unlikely(err <= 0)) + return err; + + tcp_enter_cwr(sk); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; - /* NET_XMIT_CN is special. It does not guarantee, - * that this packet is lost. 
It tells that device - * is about to start to drop packets or already - * drops some packets of the same priority and - * invokes us to send less aggressively. - */ - return err == NET_XMIT_CN ? 0 : err; - } - return -ENOBUFS; #undef SYSCTL_FLAG_TSTAMPS #undef SYSCTL_FLAG_WSCALE #undef SYSCTL_FLAG_SACK @@ -1036,7 +1053,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))) + if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) break; /* Advance the send_head. This one is sent out. @@ -1109,7 +1126,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) { + if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { update_send_head(sk, tp, skb); tcp_cwnd_validate(sk, tp); return; @@ -1429,9 +1446,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, (skb_cloned(skb) ? - pskb_copy(skb, GFP_ATOMIC): - skb_clone(skb, GFP_ATOMIC))); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (err == 0) { /* Update global TCP statistics. */ @@ -1665,7 +1680,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (tcp_transmit_skb(sk, skb)) + if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); } @@ -1700,7 +1715,7 @@ int tcp_send_synack(struct sock *sk) TCP_ECN_send_synack(tcp_sk(sk), skb); } TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } /* @@ -1861,7 +1876,7 @@ int tcp_connect(struct sock *sk) __skb_queue_tail(&sk->sk_write_queue, buff); sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); - tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ @@ -1957,7 +1972,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff); + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } } @@ -1997,7 +2012,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; - return tcp_transmit_skb(sk, skb); + return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) @@ -2030,7 +2045,7 @@ int tcp_write_wakeup(struct sock *sk) TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); if (!err) { update_send_head(sk, tp, skb); } -- cgit From 2f9616d4c44349c903bc1b54fe46ab0ce0210b74 Mon Sep 17 00:00:00 2001 From: Marcus Sundberg Date: Mon, 12 Dec 2005 15:02:48 -0800 Subject: [NETFILTER]: ip_nat_tftp: Fix expectation NAT When a TFTP client is SNATed so that the port is also changed, the port is never changed back for the expected connection. 
Signed-off-by: Marcus Sundberg Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_tftp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c index 2215317c76b7..43c3bd7c118e 100644 --- a/net/ipv4/netfilter/ip_nat_tftp.c +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -42,7 +42,10 @@ static unsigned int help(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, struct ip_conntrack_expect *exp) { - exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; + struct ip_conntrack *ct = exp->master; + + exp->saved_proto.udp.port + = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; exp->dir = IP_CT_DIR_REPLY; exp->expectfn = ip_nat_follow_master; if (ip_conntrack_expect_related(exp) != 0) -- cgit From 1542272a60ab9c0655a13ead8b7d7a661365f9fb Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 14 Dec 2005 12:55:24 -0800 Subject: [GRE]: Fix hardware checksum modification The skb_postpull_rcsum conversion introduced a bug in the checksum modification. Although the length pulled is offset bytes, the origin of the pulling is the GRE header, not the IP header. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a4c347c3b8e3..46f9d9cf7a5f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -618,7 +618,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb->mac.raw = skb->nh.raw; skb->nh.raw = __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb->mac.raw, offset); + skb_postpull_rcsum(skb, skb->h.raw, offset); memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); skb->pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST -- cgit
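skb_postpull_rcsum() keeps a CHECKSUM_HW skb->csum valid by subtracting the checksum of the bytes just pulled off the head of the packet, so the subtraction must start exactly where the pull started. A standalone illustration (plain C with a simplified 16-bit one's-complement sum, not the kernel's csum helpers; the layout and byte pattern are invented) of why the origin matters:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t csum16(const uint8_t *buf, size_t len)
    {
            uint32_t sum = 0;
            size_t i;

            for (i = 0; i + 1 < len; i += 2)
                    sum += (uint32_t)buf[i] << 8 | buf[i + 1];
            if (len & 1)
                    sum += (uint32_t)buf[len - 1] << 8;
            while (sum >> 16)               /* fold carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);
            return sum;
    }

    int main(void)
    {
            /* Toy packet: 20-byte "IP header", 8-byte "GRE header", payload. */
            uint8_t pkt[40];
            size_t gre_off = 20, offset = 8, i;

            for (i = 0; i < sizeof(pkt); i++)
                    pkt[i] = (uint8_t)(i * 7 + 3);

            /* What must be subtracted: the pulled bytes, which start at
             * the GRE header (skb->h.raw in the fixed call)... */
            printf("pulled span: %#x\n", (unsigned)csum16(pkt + gre_off, offset));
            /* ...not the same number of bytes counted from the IP header
             * (skb->mac.raw after the pointer shuffle, the buggy call). */
            printf("wrong span:  %#x\n", (unsigned)csum16(pkt, offset));
            return 0;
    }

The two sums differ, so the buggy call adjusted skb->csum by the checksum of bytes that were never removed, leaving a corrupted running checksum for any later software verification.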
From 0476f171affa6eca62021fca2ae9f5140acc3713 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 19 Dec 2005 13:53:09 -0800 Subject: [NETFILTER]: Fix NAT init order As noticed by Phil Oester, the GRE NAT protocol helper is initialized before the NAT core, which makes registration fail. Change the link order so that the NAT core is initialized first. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 058c48e258fc..d0a447e520a2 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -12,6 +12,7 @@ ip_nat_pptp-objs := ip_nat_helper_pptp.o ip_nat_proto_gre.o # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +obj-$(CONFIG_IP_NF_NAT) += ip_nat.o # conntrack netlink interface obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o @@ -41,7 +42,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o -obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o # matches -- cgit From 9e999993c71e1506378d26d81f842277aff8a250 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 19 Dec 2005 14:03:46 -0800 Subject: [XFRM]: Handle DCCP in xfrm{4,6}_decode_session Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/xfrm4_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b2b60f3e9cdd..42196ba3b0b9 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -182,6 +182,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) case IPPROTO_UDP: case IPPROTO_TCP: case IPPROTO_SCTP: + case IPPROTO_DCCP: if (pskb_may_pull(skb, xprth + 4 - skb->data)) { u16 *ports = (u16 *)xprth; -- cgit
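The one-line DCCP case works because, for the purposes of _decode_session4(), DCCP looks exactly like TCP, UDP, and SCTP: the generic header begins with 16-bit source and destination ports, so the existing 4-byte fetch fills the flow key unchanged. A minimal sketch (illustrative names, not kernel code):

    #include <stdint.h>

    /* First 4 bytes of a DCCP generic header (RFC 4340), which is all
     * the flow lookup needs; TCP, UDP, and SCTP open the same way. */
    struct transport_ports {
            uint16_t sport;         /* network byte order */
            uint16_t dport;
    };

    /* xprth points at the transport header, as in _decode_session4();
     * the caller must already have verified that 4 bytes are readable
     * (pskb_may_pull in the kernel code). */
    static void fill_flow_ports(const uint8_t *xprth,
                                uint16_t *fl_sport, uint16_t *fl_dport)
    {
            const uint16_t *ports = (const uint16_t *)xprth;

            *fl_sport = ports[0];
            *fl_dport = ports[1];
    }

Any transport whose header leads with the two port fields can be added to that switch with a bare case label and no new extraction code.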