Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/fib_lookup.h            |   8
-rw-r--r--  net/ipv4/fib_semantics.c         |  33
-rw-r--r--  net/ipv4/fib_trie.c              | 194
-rw-r--r--  net/ipv4/gre_offload.c           |   2
-rw-r--r--  net/ipv4/inet_connection_sock.c  |  14
-rw-r--r--  net/ipv4/ip_output.c             |   8
-rw-r--r--  net/ipv4/route.c                 |  31
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c       |   9
-rw-r--r--  net/ipv4/tcp.c                   |  10
-rw-r--r--  net/ipv4/tcp_cubic.c             |  83
-rw-r--r--  net/ipv4/tcp_input.c             |  13
-rw-r--r--  net/ipv4/tcp_ipv4.c              | 129
-rw-r--r--  net/ipv4/tcp_metrics.c           |  13
-rw-r--r--  net/ipv4/tcp_output.c            |  12
-rw-r--r--  net/ipv4/udp.c                   |   3
-rw-r--r--  net/ipv4/udp_offload.c           |   2
16 files changed, 413 insertions, 151 deletions
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index a68b5e21ec51..c092e9a55790 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -16,6 +16,9 @@ struct fib_alias {
 	u8 fa_slen;
 	u32 tb_id;
 	s16 fa_default;
+	u8 offload:1,
+	   trap:1,
+	   unused:6;
 	struct rcu_head rcu;
 };
 
@@ -35,9 +38,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		 struct netlink_ext_ack *extack);
 bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
-		  u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
-		  unsigned int);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+		  struct fib_rt_info *fri, unsigned int flags);
 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
 	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f1888c683426..a803cdd9400a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -504,6 +504,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	       int dst_len, u32 tb_id, const struct nl_info *info,
 	       unsigned int nlm_flags)
 {
+	struct fib_rt_info fri;
 	struct sk_buff *skb;
 	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
 	int err = -ENOBUFS;
@@ -512,9 +513,15 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
 	if (!skb)
 		goto errout;
 
-	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
-			    fa->fa_type, key, dst_len,
-			    fa->fa_tos, fa->fa_info, nlm_flags);
+	fri.fi = fa->fa_info;
+	fri.tb_id = tb_id;
+	fri.dst = key;
+	fri.dst_len = dst_len;
+	fri.tos = fa->fa_tos;
+	fri.type = fa->fa_type;
+	fri.offload = fa->offload;
+	fri.trap = fa->trap;
+	err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -1725,10 +1732,11 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
 #endif
 
 int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
-		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
-		  struct fib_info *fi, unsigned int flags)
+		  struct fib_rt_info *fri, unsigned int flags)
 {
-	unsigned int nhs = fib_info_num_path(fi);
+	unsigned int nhs = fib_info_num_path(fri->fi);
+	struct fib_info *fi = fri->fi;
+	u32 tb_id = fri->tb_id;
 	struct nlmsghdr *nlh;
 	struct rtmsg *rtm;
 
@@ -1738,22 +1746,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 
 	rtm = nlmsg_data(nlh);
 	rtm->rtm_family = AF_INET;
-	rtm->rtm_dst_len = dst_len;
+	rtm->rtm_dst_len = fri->dst_len;
 	rtm->rtm_src_len = 0;
-	rtm->rtm_tos = tos;
+	rtm->rtm_tos = fri->tos;
 	if (tb_id < 256)
 		rtm->rtm_table = tb_id;
 	else
 		rtm->rtm_table = RT_TABLE_COMPAT;
 	if (nla_put_u32(skb, RTA_TABLE, tb_id))
 		goto nla_put_failure;
-	rtm->rtm_type = type;
+	rtm->rtm_type = fri->type;
 	rtm->rtm_flags = fi->fib_flags;
 	rtm->rtm_scope = fi->fib_scope;
 	rtm->rtm_protocol = fi->fib_protocol;
 
 	if (rtm->rtm_dst_len &&
-	    nla_put_in_addr(skb, RTA_DST, dst))
+	    nla_put_in_addr(skb, RTA_DST, fri->dst))
 		goto nla_put_failure;
 	if (fi->fib_priority &&
 	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
@@ -1795,6 +1803,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			goto nla_put_failure;
 	}
 
+	if (fri->offload)
+		rtm->rtm_flags |= RTM_F_OFFLOAD;
+	if (fri->trap)
+		rtm->rtm_flags |= RTM_F_TRAP;
+
 	nlmsg_end(skb, nlh);
 	return 0;
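
The two new bits surface to userspace through rtm_flags, so any RTM_NEWROUTE consumer can read back the hardware state. A minimal sketch of such a check (hypothetical helper, not part of this series):

```c
#include <linux/rtnetlink.h>

/* Hypothetical helper for a route-dump consumer: classify the hardware
 * state that fib_dump_info() now encodes into rtm_flags.
 */
static const char *route_hw_state(const struct rtmsg *rtm)
{
	if (rtm->rtm_flags & RTM_F_OFFLOAD)
		return "offloaded";	/* forwarded by the ASIC */
	if (rtm->rtm_flags & RTM_F_TRAP)
		return "trapped";	/* hardware punts packets to the kernel */
	return "software";
}
```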
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 195469a13371..ff0c24371e33 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -980,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t,
 
 /* Return the first fib alias matching TOS with
  * priority less than or equal to PRIO.
+ * If 'find_first' is set, return the first matching
+ * fib alias, regardless of TOS and priority.
  */
 static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
-					u8 tos, u32 prio, u32 tb_id)
+					u8 tos, u32 prio, u32 tb_id,
+					bool find_first)
 {
 	struct fib_alias *fa;
 
@@ -998,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
 			continue;
 		if (fa->tb_id != tb_id)
 			break;
+		if (find_first)
+			return fa;
 		if (fa->fa_tos > tos)
 			continue;
 		if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
@@ -1007,6 +1012,52 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
 	return NULL;
 }
 
+static struct fib_alias *
+fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
+{
+	u8 slen = KEYLENGTH - fri->dst_len;
+	struct key_vector *l, *tp;
+	struct fib_table *tb;
+	struct fib_alias *fa;
+	struct trie *t;
+
+	tb = fib_get_table(net, fri->tb_id);
+	if (!tb)
+		return NULL;
+
+	t = (struct trie *)tb->tb_data;
+	l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
+	if (!l)
+		return NULL;
+
+	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+		if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
+		    fa->fa_tos == fri->tos && fa->fa_info == fri->fi &&
+		    fa->fa_type == fri->type)
+			return fa;
+	}
+
+	return NULL;
+}
+
+void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
+{
+	struct fib_alias *fa_match;
+
+	rcu_read_lock();
+
+	fa_match = fib_find_matching_alias(net, fri);
+	if (!fa_match)
+		goto out;
+
+	fa_match->offload = fri->offload;
+	fa_match->trap = fri->trap;
+
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
+
 static void trie_rebalance(struct trie *t, struct key_vector *tn)
 {
 	while (!IS_TRIE(tn))
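
fib_alias_hw_flags_set() is exported for switch drivers that program routes into hardware. A sketch of a plausible caller, assuming the route arrived through the FIB notifier chain (the function name and driver context are illustrative, not from this patch):

```c
/* Illustrative driver-side caller: after the ASIC accepts (or rejects)
 * a route, reflect that in the alias flags so route dumps show it.
 */
static void example_route_hw_done(struct net *net,
				  const struct fib_entry_notifier_info *fen_info,
				  bool offloaded)
{
	struct fib_rt_info fri = {
		.fi      = fen_info->fi,
		.tb_id   = fen_info->tb_id,
		.dst     = cpu_to_be32(fen_info->dst),
		.dst_len = fen_info->dst_len,
		.tos     = fen_info->tos,
		.type    = fen_info->type,
		.offload = offloaded,
		.trap    = !offloaded,	/* trapping if not forwarded in HW */
	};

	fib_alias_hw_flags_set(net, &fri);
}
```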
@@ -1063,9 +1114,6 @@ noleaf:
 	return -ENOMEM;
 }
 
-/* fib notifier for ADD is sent before calling fib_insert_alias with
- * the expectation that the only possible failure ENOMEM
- */
 static int fib_insert_alias(struct trie *t, struct key_vector *tp,
 			    struct key_vector *l, struct fib_alias *new,
 			    struct fib_alias *fa, t_key key)
@@ -1118,11 +1166,13 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
 	return true;
 }
 
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+			     struct key_vector *l, struct fib_alias *old);
+
 /* Caller must hold RTNL. */
 int fib_table_insert(struct net *net, struct fib_table *tb,
 		     struct fib_config *cfg, struct netlink_ext_ack *extack)
 {
-	enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct fib_alias *fa, *new_fa;
 	struct key_vector *l, *tp;
@@ -1149,7 +1199,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 
 	l = fib_find_node(t, &tp, key);
 	fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
-				tb->tb_id) : NULL;
+				tb->tb_id, false) : NULL;
 
 	/* Now fa, if non-NULL, points to the first fib alias
 	 * with the same keys [prefix,tos,priority], if such key already
@@ -1216,19 +1266,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			new_fa->fa_slen = fa->fa_slen;
 			new_fa->tb_id = tb->tb_id;
 			new_fa->fa_default = -1;
+			new_fa->offload = 0;
+			new_fa->trap = 0;
 
-			err = call_fib_entry_notifiers(net,
-						       FIB_EVENT_ENTRY_REPLACE,
-						       key, plen, new_fa,
-						       extack);
-			if (err)
-				goto out_free_new_fa;
+			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+			if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
+					   tb->tb_id, true) == new_fa) {
+				enum fib_event_type fib_event;
+
+				fib_event = FIB_EVENT_ENTRY_REPLACE;
+				err = call_fib_entry_notifiers(net, fib_event,
							       key, plen,
+							       new_fa, extack);
+				if (err) {
+					hlist_replace_rcu(&new_fa->fa_list,
+							  &fa->fa_list);
+					goto out_free_new_fa;
+				}
+			}
 
 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
 				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
 
-			hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
 			alias_free_mem_rcu(fa);
 
 			fib_release_info(fi_drop);
@@ -1244,12 +1304,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			if (fa_match)
 				goto out;
 
-			if (cfg->fc_nlflags & NLM_F_APPEND) {
-				event = FIB_EVENT_ENTRY_APPEND;
+			if (cfg->fc_nlflags & NLM_F_APPEND)
 				nlflags |= NLM_F_APPEND;
-			} else {
+			else
 				fa = fa_first;
-			}
 		}
 		err = -ENOENT;
 		if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1268,15 +1326,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 	new_fa->fa_slen = slen;
 	new_fa->tb_id = tb->tb_id;
 	new_fa->fa_default = -1;
-
-	err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
-	if (err)
-		goto out_free_new_fa;
+	new_fa->offload = 0;
+	new_fa->trap = 0;
 
 	/* Insert new entry to the list. */
 	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
 	if (err)
-		goto out_fib_notif;
+		goto out_free_new_fa;
+
+	/* The alias was already inserted, so the node must exist. */
+	l = l ? l : fib_find_node(t, &tp, key);
+	if (WARN_ON_ONCE(!l))
+		goto out_free_new_fa;
+
+	if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
+	    new_fa) {
+		enum fib_event_type fib_event;
+
+		fib_event = FIB_EVENT_ENTRY_REPLACE;
+		err = call_fib_entry_notifiers(net, fib_event, key, plen,
+					       new_fa, extack);
+		if (err)
+			goto out_remove_new_fa;
+	}
 
 	if (!plen)
 		tb->tb_num_default++;
@@ -1287,14 +1359,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 succeeded:
 	return 0;
 
-out_fib_notif:
-	/* notifier was sent that entry would be added to trie, but
-	 * the add failed and need to recover. Only failure for
-	 * fib_insert_alias is ENOMEM.
-	 */
-	NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
-	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
-				 plen, new_fa, NULL);
+out_remove_new_fa:
+	fib_remove_alias(t, tp, l, new_fa);
 out_free_new_fa:
 	kmem_cache_free(fn_alias_kmem, new_fa);
 out:
@@ -1545,6 +1611,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
 		node_pull_suffix(tp, fa->fa_slen);
 }
 
+static void fib_notify_alias_delete(struct net *net, u32 key,
+				    struct hlist_head *fah,
+				    struct fib_alias *fa_to_delete,
+				    struct netlink_ext_ack *extack)
+{
+	struct fib_alias *fa_next, *fa_to_notify;
+	u32 tb_id = fa_to_delete->tb_id;
+	u8 slen = fa_to_delete->fa_slen;
+	enum fib_event_type fib_event;
+
+	/* Do not notify if we do not care about the route. */
+	if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
+		return;
+
+	/* Determine if the route should be replaced by the next route in the
+	 * list.
+	 */
+	fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
+				   struct fib_alias, fa_list);
+	if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
+		fib_event = FIB_EVENT_ENTRY_REPLACE;
+		fa_to_notify = fa_next;
+	} else {
+		fib_event = FIB_EVENT_ENTRY_DEL;
+		fa_to_notify = fa_to_delete;
+	}
+	call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
+				 fa_to_notify, extack);
+}
+
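With the notifications reordered this way, a listener sees exactly one event per {prefix, prefix length, table}: ENTRY_REPLACE when the best alias changes and ENTRY_DEL when the last one goes away. A sketch of what that buys a consumer (hypothetical notifier block, error handling omitted):

```c
/* Hypothetical FIB notifier: with the REPLACE/DEL model above, REPLACE
 * is an upsert keyed on {dst/dst_len, tb_id} and DEL a plain removal -
 * no ADD/APPEND ordering bookkeeping is needed anymore.
 */
static int example_fib_event(struct notifier_block *nb, unsigned long event,
			     void *ptr)
{
	struct fib_notifier_info *info = ptr;
	struct fib_entry_notifier_info *fen_info;
	__be32 dst;

	if (info->family != AF_INET)
		return NOTIFY_DONE;

	fen_info = container_of(info, struct fib_entry_notifier_info, info);
	dst = cpu_to_be32(fen_info->dst);

	switch (event) {
	case FIB_EVENT_ENTRY_REPLACE:
		/* Upsert: overwrite whatever was programmed for this key. */
		pr_debug("replace %pI4/%d in table %u\n", &dst,
			 fen_info->dst_len, fen_info->tb_id);
		break;
	case FIB_EVENT_ENTRY_DEL:
		pr_debug("delete %pI4/%d from table %u\n", &dst,
			 fen_info->dst_len, fen_info->tb_id);
		break;
	}
	return NOTIFY_DONE;
}
```
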
 /* Caller must hold RTNL. */
 int fib_table_delete(struct net *net, struct fib_table *tb,
 		     struct fib_config *cfg, struct netlink_ext_ack *extack)
@@ -1566,7 +1662,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
 	if (!l)
 		return -ESRCH;
 
-	fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
+	fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false);
 	if (!fa)
 		return -ESRCH;
 
@@ -1598,8 +1694,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
 	if (!fa_to_delete)
 		return -ESRCH;
 
-	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
-				 fa_to_delete, extack);
+	fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
 
@@ -1923,10 +2018,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
 				continue;
 			}
 
-			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
-						 n->key,
-						 KEYLENGTH - fa->fa_slen, fa,
-						 NULL);
+			fib_notify_alias_delete(net, n->key, &n->leaf, fa,
+						NULL);
 			hlist_del_rcu(&fa->fa_list);
 			fib_release_info(fa->fa_info);
 			alias_free_mem_rcu(fa);
@@ -2022,6 +2115,7 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
 			   struct netlink_ext_ack *extack)
 {
 	struct fib_alias *fa;
+	int last_slen = -1;
 	int err;
 
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
@@ -2036,8 +2130,12 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
 		if (tb->tb_id != fa->tb_id)
 			continue;
 
-		err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key,
-					      KEYLENGTH - fa->fa_slen,
+		if (fa->fa_slen == last_slen)
+			continue;
+
+		last_slen = fa->fa_slen;
+		err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
+					      l->key, KEYLENGTH - fa->fa_slen,
 					      fa, extack);
 		if (err)
 			return err;
@@ -2146,14 +2244,20 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 
 		if (filter->dump_routes) {
 			if (!s_fa) {
+				struct fib_rt_info fri;
+
+				fri.fi = fi;
+				fri.tb_id = tb->tb_id;
+				fri.dst = xkey;
+				fri.dst_len = KEYLENGTH - fa->fa_slen;
+				fri.tos = fa->fa_tos;
+				fri.type = fa->fa_type;
+				fri.offload = fa->offload;
+				fri.trap = fa->trap;
 				err = fib_dump_info(skb,
 						    NETLINK_CB(cb->skb).portid,
 						    cb->nlh->nlmsg_seq,
-						    RTM_NEWROUTE,
-						    tb->tb_id, fa->fa_type,
-						    xkey,
-						    KEYLENGTH - fa->fa_slen,
-						    fa->fa_tos, fi, flags);
+						    RTM_NEWROUTE, &fri, flags);
 				if (err < 0)
 					goto stop;
 			}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 4de7e962d3da..2e6d1b7a7bc9 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
 	if (skb_gro_checksum_simple_validate(skb))
 		goto out_unlock;
 
-	skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
+	skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
 				     null_compute_pseudo);
 }
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 18c0d5bffe12..6a691fd04398 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -770,6 +770,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 }
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
 
+static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
+			   const gfp_t priority)
+{
+	struct inet_connection_sock *icsk = inet_csk(newsk);
+
+	if (!icsk->icsk_ulp_ops)
+		return;
+
+	if (icsk->icsk_ulp_ops->clone)
+		icsk->icsk_ulp_ops->clone(req, newsk, priority);
+}
+
 /**
  *	inet_csk_clone_lock - clone an inet socket, and lock its clone
  *	@sk: the socket to clone
@@ -810,6 +822,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 		/* Deinitialize accept_queue to trap illegal accesses. */
 		memset(&newicsk->icsk_accept_queue, 0,
 		       sizeof(newicsk->icsk_accept_queue));
 
+		inet_clone_ulp(req, newsk, priority);
+
 		security_inet_csk_clone(newsk, req);
 	}
 	return newsk;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 14db1e0b8a6e..d84819893db9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -240,8 +240,8 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
 static int ip_finish_output_gso(struct net *net, struct sock *sk,
 				struct sk_buff *skb, unsigned int mtu)
 {
+	struct sk_buff *segs, *nskb;
 	netdev_features_t features;
-	struct sk_buff *segs;
 	int ret = 0;
 
 	/* common case: seglen is <= mtu
@@ -272,8 +272,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 
 	consume_skb(skb);
 
-	do {
-		struct sk_buff *nskb = segs->next;
+	skb_list_walk_safe(segs, segs, nskb) {
 		int err;
 
 		skb_mark_not_on_list(segs);
@@ -281,8 +280,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 
 		if (err && ret == 0)
 			ret = err;
 
-		segs = nskb;
-	} while (segs);
+	}
 
 	return ret;
 }
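
skb_list_walk_safe() (include/linux/skbuff.h) hides the next-pointer bookkeeping the removed do/while carried by hand. A minimal sketch of the pattern, mirroring the converted loop (not a drop-in kernel function; ip_finish_output2() is static to ip_output.c):

```c
/* Sketch of the iteration pattern adopted above: walk a GSO segment
 * list while each member may be unlinked or consumed by the loop body.
 */
static int example_output_segs(struct net *net, struct sock *sk,
			       struct sk_buff *segs)
{
	struct sk_buff *skb, *next;
	int ret = 0;

	skb_list_walk_safe(segs, skb, next) {
		int err;

		skb_mark_not_on_list(skb);	/* detach before transmit */
		err = ip_finish_output2(net, sk, skb);
		if (err && ret == 0)
			ret = err;
	}
	return ret;
}
```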
.procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d885ba868822..6711a97de3ce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -443,8 +443,6 @@ void tcp_init_sock(struct sock *sk) tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; - sk->sk_state = TCP_CLOSE; - sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -692,8 +690,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } -static void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle, int size_goal) +void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -927,7 +925,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return max(size_goal, mss_now); } -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; @@ -1778,6 +1776,8 @@ static int tcp_zerocopy_receive(struct sock *sk, while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { if (skb) { + if (zc->recv_skip_hint > 0) + break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 1b3d032a4df2..8f8eefd3a3ce 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -40,8 +40,8 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (4U<<3) -#define HYSTART_DELAY_MAX (16U<<3) +#define HYSTART_DELAY_MIN (4000U) /* 4 ms */ +#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) static int fast_convergence __read_mostly = 1; @@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; -static int hystart_ack_delta __read_mostly = 2; +static int hystart_ack_delta_us __read_mostly = 2000; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); -module_param(hystart_ack_delta, int, 0644); -MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); +module_param(hystart_ack_delta_us, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -89,7 +89,7 @@ struct bictcp { u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay (msec << 3) */ + u32 delay_min; /* min delay (usec) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } -static inline u32 bictcp_clock(void) +static inline u32 bictcp_clock_us(const struct sock *sk) { -#if HZ < 1000 - return ktime_to_ms(ktime_get_real()); -#else - return 
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 1b3d032a4df2..8f8eefd3a3ce 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -40,8 +40,8 @@
 
 /* Number of delay samples for detecting the increase of delay */
 #define HYSTART_MIN_SAMPLES	8
-#define HYSTART_DELAY_MIN	(4U<<3)
-#define HYSTART_DELAY_MAX	(16U<<3)
+#define HYSTART_DELAY_MIN	(4000U)	/* 4 ms */
+#define HYSTART_DELAY_MAX	(16000U)	/* 16 ms */
 #define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
 
 static int fast_convergence __read_mostly = 1;
@@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1;
 static int hystart __read_mostly = 1;
 static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
 static int hystart_low_window __read_mostly = 16;
-static int hystart_ack_delta __read_mostly = 2;
+static int hystart_ack_delta_us __read_mostly = 2000;
 
 static u32 cube_rtt_scale __read_mostly;
 static u32 beta_scale __read_mostly;
@@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
 		 " 1: packet-train 2: delay 3: both packet-train and delay");
 module_param(hystart_low_window, int, 0644);
 MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
-module_param(hystart_ack_delta, int, 0644);
-MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
 
 /* BIC TCP Parameters */
 struct bictcp {
@@ -89,7 +89,7 @@ struct bictcp {
 	u32	bic_origin_point;/* origin point of bic function */
 	u32	bic_K;		/* time to origin point
 				   from the beginning of the current epoch */
-	u32	delay_min;	/* min delay (msec << 3) */
+	u32	delay_min;	/* min delay (usec) */
 	u32	epoch_start;	/* beginning of an epoch */
 	u32	ack_cnt;	/* number of acks */
 	u32	tcp_cwnd;	/* estimated tcp cwnd */
@@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca)
 	ca->found = 0;
 }
 
-static inline u32 bictcp_clock(void)
+static inline u32 bictcp_clock_us(const struct sock *sk)
 {
-#if HZ < 1000
-	return ktime_to_ms(ktime_get_real());
-#else
-	return jiffies_to_msecs(jiffies);
-#endif
+	return tcp_sk(sk)->tcp_mstamp;
 }
 
 static inline void bictcp_hystart_reset(struct sock *sk)
@@ -131,9 +127,9 @@ static inline void bictcp_hystart_reset(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
 
-	ca->round_start = ca->last_ack = bictcp_clock();
+	ca->round_start = ca->last_ack = bictcp_clock_us(sk);
 	ca->end_seq = tp->snd_nxt;
-	ca->curr_rtt = 0;
+	ca->curr_rtt = ~0U;
 	ca->sample_cnt = 0;
 }
 
@@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
 	 */
 
 	t = (s32)(tcp_jiffies32 - ca->epoch_start);
-	t += msecs_to_jiffies(ca->delay_min >> 3);
+	t += usecs_to_jiffies(ca->delay_min);
 	/* change the unit from HZ to bictcp_HZ */
 	t <<= BICTCP_HZ;
 	do_div(t, HZ);
@@ -376,22 +372,54 @@ static void bictcp_state(struct sock *sk, u8 new_state)
 	}
 }
 
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(struct sock *sk)
+{
+	unsigned long rate;
+
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	if (!rate)
+		return 0;
+	return min_t(u64, USEC_PER_MSEC,
+		     div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
+}
+
 static void hystart_update(struct sock *sk, u32 delay)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bictcp *ca = inet_csk_ca(sk);
-
-	if (ca->found & hystart_detect)
-		return;
+	u32 threshold;
 
 	if (hystart_detect & HYSTART_ACK_TRAIN) {
-		u32 now = bictcp_clock();
+		u32 now = bictcp_clock_us(sk);
 
 		/* first detection parameter - ack-train detection */
-		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
 			ca->last_ack = now;
-			if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
-				ca->found |= HYSTART_ACK_TRAIN;
+
+			threshold = ca->delay_min + hystart_ack_delay(sk);
+
+			/* Hystart ack train triggers if we get ack past
+			 * ca->delay_min/2.
+			 * Pacing might have delayed packets up to RTT/2
+			 * during slow start.
+			 */
+			if (sk->sk_pacing_status == SK_PACING_NONE)
+				threshold >>= 1;
+
+			if ((s32)(now - ca->round_start) > threshold) {
+				ca->found = 1;
+				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+					 now - ca->round_start, threshold,
+					 ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
 				NET_ADD_STATS(sock_net(sk),
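
To get a feel for the magnitudes, a standalone mirror of the cushion computation (the constants match the kernel's; the pacing rates are assumed example values):

```c
#include <stdio.h>

#define GSO_MAX_SIZE	65536ULL
#define USEC_PER_SEC	1000000ULL
#define USEC_PER_MSEC	1000ULL

/* Userspace mirror of hystart_ack_delay() above: rate is the pacing
 * rate in bytes per second, result is the cushion in microseconds.
 */
static unsigned long long ack_delay_cushion_us(unsigned long long rate)
{
	unsigned long long v;

	if (!rate)
		return 0;
	v = GSO_MAX_SIZE * 4 * USEC_PER_SEC / rate;
	return v < USEC_PER_MSEC ? v : USEC_PER_MSEC;	/* cap at 1 ms */
}

int main(void)
{
	/* 10 Gbit/s (1.25e9 B/s) -> ~209 us; 100 Mbit/s hits the 1 ms cap */
	printf("%llu us\n", ack_delay_cushion_us(1250000000ULL));
	printf("%llu us\n", ack_delay_cushion_us(12500000ULL));
	return 0;
}
```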
@@ -405,14 +433,14 @@ static void hystart_update(struct sock *sk, u32 delay)
 	if (hystart_detect & HYSTART_DELAY) {
 		/* obtain the minimum delay of more than sampling packets */
 		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
-			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
+			if (ca->curr_rtt > delay)
 				ca->curr_rtt = delay;
 
 			ca->sample_cnt++;
 		} else {
 			if (ca->curr_rtt > ca->delay_min +
 			    HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
-				ca->found |= HYSTART_DELAY;
+				ca->found = 1;
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTDELAYDETECT);
 				NET_ADD_STATS(sock_net(sk),
@@ -424,9 +452,6 @@ static void hystart_update(struct sock *sk, u32 delay)
 	}
 }
 
-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
 static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -441,7 +466,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 	if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
 		return;
 
-	delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
+	delay = sample->rtt_us;
 	if (delay == 0)
 		delay = 1;
 
@@ -450,7 +475,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 		ca->delay_min = delay;
 
 	/* hystart triggers when cwnd is larger than some threshold */
-	if (hystart && tcp_in_slow_start(tp) &&
+	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
 	    tp->snd_cwnd >= hystart_low_window)
 		hystart_update(sk, delay);
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5347ab2c9c58..358365598216 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1423,7 +1423,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
 		goto fallback;
 
-	if (!tcp_skb_can_collapse_to(prev))
+	if (!tcp_skb_can_collapse(prev, skb))
 		goto fallback;
 
 	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
@@ -3554,7 +3554,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
 	if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
 		return;
 
-	if (unlikely(rexmit == 2)) {
+	if (unlikely(rexmit == REXMIT_NEW)) {
 		__tcp_push_pending_frames(sk, tcp_current_mss(sk),
 					  TCP_NAGLE_OFF);
 		if (after(tp->snd_nxt, tp->high_seq))
@@ -4424,6 +4424,9 @@ static bool tcp_try_coalesce(struct sock *sk,
 	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
 		return false;
 
+	if (!mptcp_skb_can_collapse(to, from))
+		return false;
+
 #ifdef CONFIG_TLS_DEVICE
 	if (from->decrypted != to->decrypted)
 		return false;
@@ -4933,7 +4936,7 @@ restart:
 		/* The first skb to collapse is:
 		 * - not SYN/FIN and
 		 * - bloated or contains data before "start" or
-		 *   overlaps to the next one.
+		 *   overlaps to the next one and mptcp allow collapsing.
 		 */
 		if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
 		    (tcp_win_from_space(sk, skb->truesize) > skb->len ||
@@ -4942,7 +4945,7 @@ restart:
 			break;
 		}
 
-		if (n && n != tail &&
+		if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
 		    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
 			end_of_skbs = false;
 			break;
@@ -4975,6 +4978,7 @@ restart:
 		else
 			__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
 		skb_set_owner_r(nskb, sk);
+		mptcp_skb_ext_move(nskb, skb);
 
 		/* Copy data, releasing collapsed skbs. */
 		while (copy > 0) {
@@ -4994,6 +4998,7 @@ restart:
 
 			skb = tcp_collapse_one(sk, skb, list, root);
 			if (!skb ||
 			    skb == tail ||
+			    !mptcp_skb_can_collapse(nskb, skb) ||
 			    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 				goto end;
 #ifdef CONFIG_TLS_DEVICE
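
The tcp_skb_can_collapse() helper these hunks switch to lives in include/net/tcp.h; in this series it roughly bundles the old EOR test with the new MPTCP guard (an approximation for reference, not the authoritative definition):

```c
/* Sketch of the combined check used above: skbs may only be merged if
 * the destination has no EOR mark and both carry compatible (or no)
 * MPTCP mappings.
 */
static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
					const struct sk_buff *from)
{
	return likely(tcp_skb_can_collapse_to(to) &&
		      mptcp_skb_can_collapse(to, from));
}
```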
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1c7326e04f9b..fedb537839ec 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -701,9 +701,21 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	rcu_read_lock();
 	hash_location = tcp_parse_md5sig_option(th);
 	if (sk && sk_fullsock(sk)) {
-		key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
-					&ip_hdr(skb)->saddr, AF_INET);
+		const union tcp_md5_addr *addr;
+		int l3index;
+
+		/* sdif set, means packet ingressed via a device
+		 * in an L3 domain and inet_iif is set to it.
+		 */
+		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 	} else if (hash_location) {
+		const union tcp_md5_addr *addr;
+		int sdif = tcp_v4_sdif(skb);
+		int dif = inet_iif(skb);
+		int l3index;
+
 		/*
 		 * active side is lost. Try to find listening socket through
 		 * source port, and then find md5 key through listening socket.
@@ -714,14 +726,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 		sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
 					     ip_hdr(skb)->saddr,
 					     th->source, ip_hdr(skb)->daddr,
-					     ntohs(th->source), inet_iif(skb),
-					     tcp_v4_sdif(skb));
+					     ntohs(th->source), dif, sdif);
 		/* don't send rst if it can't find key */
 		if (!sk1)
 			goto out;
 
-		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
-					&ip_hdr(skb)->saddr, AF_INET);
+		/* sdif set, means packet ingressed via a device
+		 * in an L3 domain and dif is set to it.
+		 */
+		l3index = sdif ? dif : 0;
+		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
 		if (!key)
 			goto out;
 
@@ -905,6 +920,9 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req)
 {
+	const union tcp_md5_addr *addr;
+	int l3index;
+
 	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
 	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
 	 */
@@ -916,14 +934,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	 * exception of <SYN> segments, MUST be right-shifted by
 	 * Rcv.Wind.Shift bits:
 	 */
+	addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+	l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
 			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
 			req->ts_recent,
 			0,
-			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
-					  AF_INET),
+			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
 			ip_hdr(skb)->tos);
 }
@@ -983,7 +1002,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
 EXPORT_SYMBOL(tcp_md5_needed);
 
 /* Find the Key structure for an address.  */
-struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
 					   const union tcp_md5_addr *addr,
 					   int family)
 {
@@ -1003,7 +1022,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 		if (key->family != family)
 			continue;
-
+		if (key->l3index && key->l3index != l3index)
+			continue;
 		if (family == AF_INET) {
 			mask = inet_make_mask(key->prefixlen);
 			match = (key->addr.a4.s_addr & mask) ==
@@ -1027,7 +1047,8 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup);
 
 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
 						      const union tcp_md5_addr *addr,
-						      int family, u8 prefixlen)
+						      int family, u8 prefixlen,
+						      int l3index)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
@@ -1046,6 +1067,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
 	hlist_for_each_entry_rcu(key, &md5sig->head, node) {
 		if (key->family != family)
 			continue;
+		if (key->l3index && key->l3index != l3index)
+			continue;
 		if (!memcmp(&key->addr, addr, size) &&
 		    key->prefixlen == prefixlen)
 			return key;
@@ -1057,23 +1080,26 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
 					 const struct sock *addr_sk)
 {
 	const union tcp_md5_addr *addr;
+	int l3index;
 
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+						 addr_sk->sk_bound_dev_if);
 	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
-	return tcp_md5_do_lookup(sk, addr, AF_INET);
+	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
 }
 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 
 /* This can be called on a newly created socket, from other files */
 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
-		   int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
-		   gfp_t gfp)
+		   int family, u8 prefixlen, int l3index,
+		   const u8 *newkey, u8 newkeylen, gfp_t gfp)
 {
 	/* Add Key to the list */
 	struct tcp_md5sig_key *key;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_info *md5sig;
 
-	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
 	if (key) {
 		/* Pre-existing entry - just update that one. */
 		memcpy(key->key, newkey, newkeylen);
@@ -1105,6 +1131,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 	key->keylen = newkeylen;
 	key->family = family;
 	key->prefixlen = prefixlen;
+	key->l3index = l3index;
 	memcpy(&key->addr, addr,
 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
 				      sizeof(struct in_addr));
@@ -1114,11 +1141,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
 EXPORT_SYMBOL(tcp_md5_do_add);
 
 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
-		   u8 prefixlen)
+		   u8 prefixlen, int l3index)
 {
 	struct tcp_md5sig_key *key;
 
-	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
 	if (!key)
 		return -ENOENT;
 	hlist_del_rcu(&key->node);
@@ -1149,7 +1176,9 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 {
 	struct tcp_md5sig cmd;
 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+	const union tcp_md5_addr *addr;
 	u8 prefixlen = 32;
+	int l3index = 0;
 
 	if (optlen < sizeof(cmd))
 		return -EINVAL;
@@ -1167,16 +1196,34 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
 			return -EINVAL;
 	}
 
+	if (optname == TCP_MD5SIG_EXT &&
+	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+		struct net_device *dev;
+
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+		if (dev && netif_is_l3_master(dev))
+			l3index = dev->ifindex;
+
+		rcu_read_unlock();
+
+		/* ok to reference set/not set outside of rcu;
+		 * right now device MUST be an L3 master
+		 */
+		if (!dev || !l3index)
+			return -EINVAL;
+	}
+
+	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
+
 	if (!cmd.tcpm_keylen)
-		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
-				      AF_INET, prefixlen);
+		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
 
 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
 		return -EINVAL;
 
-	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
-			      AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
-			      GFP_KERNEL);
+	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
+			      cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
 }
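
From userspace, the new lookup scope is selected with TCP_MD5SIG_EXT plus TCP_MD5SIG_FLAG_IFINDEX. A sketch binding a key to a VRF device (the name "vrf-red" is an assumption; the device must be an L3 master, as enforced above):

```c
#include <linux/tcp.h>
#include <net/if.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int set_vrf_md5_key(int fd, const struct sockaddr_in *peer,
			   const char *key, int keylen)
{
	struct tcp_md5sig md5 = {};

	memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
	md5.tcpm_flags = TCP_MD5SIG_FLAG_IFINDEX;
	md5.tcpm_ifindex = if_nametoindex("vrf-red");	/* must be an L3 master */
	md5.tcpm_keylen = keylen;
	memcpy(md5.tcpm_key, key, keylen);

	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG_EXT, &md5, sizeof(md5));
}
```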
" tcp_v4_calc_md5_hash failed" - : ""); + : "", l3index); return true; } return false; @@ -1372,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, @@ -1419,7 +1473,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG + const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; + int l3index; #endif struct ip_options_rcu *inet_opt; @@ -1467,9 +1523,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); /* Copy over the MD5 key from the original socket */ - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET); + addr = (union tcp_md5_addr *)&newinet->inet_daddr; + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key) { /* * We're using one, so create a matching key @@ -1477,8 +1534,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, + key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif @@ -1808,6 +1865,7 @@ int tcp_v4_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); + int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1856,7 +1914,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1914,7 +1972,7 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v4_inbound_md5_hash(sk, skb)) + if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; nf_reset_ct(skb); @@ -2675,6 +2733,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c4848e7a0aad..279db8822439 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk) if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tp->snd_cwnd >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk) } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. 
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c4848e7a0aad..279db8822439 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk)
 
 	if (tcp_in_initial_slowstart(tp)) {
 		/* Slow start still did not finish. */
-		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+		if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
 			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
 			if (val && (tp->snd_cwnd >> 1) > val)
 				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk)
 	} else if (!tcp_in_slow_start(tp) &&
 		   icsk->icsk_ca_state == TCP_CA_Open) {
 		/* Cong. avoidance phase, cwnd is reliable. */
-		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+		if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
 			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
 				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
 		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
@@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk)
 			tcp_metric_set(tm, TCP_METRIC_CWND,
 				       (val + tp->snd_ssthresh) >> 1);
 		}
-		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+		if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
 			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
 			if (val && tp->snd_ssthresh > val)
 				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct tcp_metrics_block *tm;
 	u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
@@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk)
 	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
 		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
 
-	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+	val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ?
+	      0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
 	if (val) {
 		tp->snd_ssthresh = val;
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 58c92a7d671c..05109d0c675b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -754,13 +754,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
 	if (unlikely(eff_sacks)) {
 		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+		if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
+					 TCPOLEN_SACK_PERBLOCK))
+			return size;
+
 		opts->num_sack_blocks =
 			min_t(unsigned int, eff_sacks,
 			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
 			      TCPOLEN_SACK_PERBLOCK);
-		if (likely(opts->num_sack_blocks))
-			size += TCPOLEN_SACK_BASE_ALIGNED +
-				opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+
+		size += TCPOLEN_SACK_BASE_ALIGNED +
+			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
 	}
 
 	return size;
@@ -2865,7 +2869,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 		if (!tcp_can_collapse(sk, skb))
 			break;
 
-		if (!tcp_skb_can_collapse_to(to))
+		if (!tcp_skb_can_collapse(to, skb))
 			break;
 
 		space -= skb->len;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 93a355b6b092..208da0917469 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2104,8 +2104,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
 		__skb_push(skb, -skb_mac_offset(skb));
 		segs = udp_rcv_segment(sk, skb, true);
-		for (skb = segs; skb; skb = next) {
-			next = skb->next;
+		skb_list_walk_safe(segs, skb, next) {
 			__skb_pull(skb, skb_transport_offset(skb));
 			ret = udp_queue_rcv_one_skb(sk, skb);
 			if (ret > 0)
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a3908e55ed89..b25e42100ceb 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -480,7 +480,7 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 					     inet_gro_compute_pseudo))
 		goto flush;
 	else if (uh->check)
-		skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+		skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
 					     inet_gro_compute_pseudo);
 skip:
 	NAPI_GRO_CB(skb)->is_ipv6 = 0;