diff options
Diffstat (limited to 'net/ipv4/tcp_metrics.c')
-rw-r--r-- | net/ipv4/tcp_metrics.c | 425 |
1 files changed, 378 insertions, 47 deletions
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0abe67bb4d3a..52f3c6b971d2 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1,13 +1,14 @@ #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/jiffies.h> -#include <linux/bootmem.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/tcp.h> #include <linux/hash.h> +#include <linux/tcp_metrics.h> +#include <linux/vmalloc.h> #include <net/inet_connection_sock.h> #include <net/net_namespace.h> @@ -17,20 +18,10 @@ #include <net/ipv6.h> #include <net/dst.h> #include <net/tcp.h> +#include <net/genetlink.h> int sysctl_tcp_nometrics_save __read_mostly; -enum tcp_metric_index { - TCP_METRIC_RTT, - TCP_METRIC_RTTVAR, - TCP_METRIC_SSTHRESH, - TCP_METRIC_CWND, - TCP_METRIC_REORDERING, - - /* Always last. */ - TCP_METRIC_MAX, -}; - struct tcp_fastopen_metrics { u16 mss; u16 syn_loss:10; /* Recurring Fast Open SYN losses */ @@ -45,8 +36,10 @@ struct tcp_metrics_block { u32 tcpm_ts; u32 tcpm_ts_stamp; u32 tcpm_lock; - u32 tcpm_vals[TCP_METRIC_MAX]; + u32 tcpm_vals[TCP_METRIC_MAX + 1]; struct tcp_fastopen_metrics tcpm_fastopen; + + struct rcu_head rcu_head; }; static bool tcp_metric_locked(struct tcp_metrics_block *tm, @@ -103,7 +96,8 @@ struct tcpm_hash_bucket { static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) +static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, + bool fastopen_clear) { u32 val; @@ -129,9 +123,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); tm->tcpm_ts = 0; tm->tcpm_ts_stamp = 0; - tm->tcpm_fastopen.mss = 0; - tm->tcpm_fastopen.syn_loss = 0; - tm->tcpm_fastopen.cookie.len = 0; + if (fastopen_clear) { + tm->tcpm_fastopen.mss = 0; + tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.cookie.len = 0; + } } static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, @@ -161,7 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, } tm->tcpm_addr = *addr; - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, true); if (likely(!reclaim)) { tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; @@ -178,7 +174,7 @@ out_unlock: static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) { if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, false); } #define TCP_METRICS_RECLAIM_DEPTH 5 @@ -447,7 +443,7 @@ void tcp_init_metrics(struct sock *sk) struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_metrics_block *tm; - u32 val; + u32 val, crtt = 0; /* cached RTT scaled by 8 */ if (dst == NULL) goto reset; @@ -482,15 +478,19 @@ void tcp_init_metrics(struct sock *sk) tp->reordering = val; } - val = tcp_metric_get(tm, TCP_METRIC_RTT); - if (val == 0 || tp->srtt == 0) { - rcu_read_unlock(); - goto reset; - } - /* Initial rtt is determined from SYN,SYN-ACK. - * The segment is small and rtt may appear much - * less than real one. Use per-dst memory - * to make it more realistic. + crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); + rcu_read_unlock(); +reset: + /* The initial RTT measurement from the SYN/SYN-ACK is not ideal + * to seed the RTO for later data packets because SYN packets are + * small. Use the per-dst cached values to seed the RTO but keep + * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). + * Later the RTO will be updated immediately upon obtaining the first + * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only + * influences the first RTO but not later RTT estimation. + * + * But if RTT is not available from the SYN (due to retransmits or + * syn cookies) or the cache, force a conservative 3secs timeout. * * A bit of theory. RTT is time passed after "normal" sized packet * is sent until it is ACKed. In normal circumstances sending small @@ -501,21 +501,11 @@ void tcp_init_metrics(struct sock *sk) * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ - val = msecs_to_jiffies(val); - if (val > tp->srtt) { - tp->srtt = val; - tp->rtt_seq = tp->snd_nxt; - } - val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); - if (val > tp->mdev) { - tp->mdev = val; - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); - } - rcu_read_unlock(); - - tcp_set_rto(sk); -reset: - if (tp->srtt == 0) { + if (crtt > tp->srtt) { + /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ + crtt >>= 3; + inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); + } else if (tp->srtt == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs @@ -690,6 +680,325 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, rcu_read_unlock(); } +static struct genl_family tcp_metrics_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = TCP_METRICS_GENL_NAME, + .version = TCP_METRICS_GENL_VERSION, + .maxattr = TCP_METRICS_ATTR_MAX, + .netnsok = true, +}; + +static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { + [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, + [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr), }, + /* Following attributes are not received for GET/DEL, + * we keep them for reference + */ +#if 0 + [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, }, + [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, }, + [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, }, + [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, }, + [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, }, + [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, }, + [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, }, + [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY, + .len = TCP_FASTOPEN_COOKIE_MAX, }, +#endif +}; + +/* Add attributes, caller cancels its header on failure */ +static int tcp_metrics_fill_info(struct sk_buff *msg, + struct tcp_metrics_block *tm) +{ + struct nlattr *nest; + int i; + + switch (tm->tcpm_addr.family) { + case AF_INET: + if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, + tm->tcpm_addr.addr.a4) < 0) + goto nla_put_failure; + break; + case AF_INET6: + if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, + tm->tcpm_addr.addr.a6) < 0) + goto nla_put_failure; + break; + default: + return -EAFNOSUPPORT; + } + + if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, + jiffies - tm->tcpm_stamp) < 0) + goto nla_put_failure; + if (tm->tcpm_ts_stamp) { + if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP, + (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0) + goto nla_put_failure; + if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL, + tm->tcpm_ts) < 0) + goto nla_put_failure; + } + + { + int n = 0; + + nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); + if (!nest) + goto nla_put_failure; + for (i = 0; i < TCP_METRIC_MAX + 1; i++) { + if (!tm->tcpm_vals[i]) + continue; + if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) + goto nla_put_failure; + n++; + } + if (n) + nla_nest_end(msg, nest); + else + nla_nest_cancel(msg, nest); + } + + { + struct tcp_fastopen_metrics tfom_copy[1], *tfom; + unsigned int seq; + + do { + seq = read_seqbegin(&fastopen_seqlock); + tfom_copy[0] = tm->tcpm_fastopen; + } while (read_seqretry(&fastopen_seqlock, seq)); + + tfom = tfom_copy; + if (tfom->mss && + nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS, + tfom->mss) < 0) + goto nla_put_failure; + if (tfom->syn_loss && + (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, + tfom->syn_loss) < 0 || + nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, + jiffies - tfom->last_syn_loss) < 0)) + goto nla_put_failure; + if (tfom->cookie.len > 0 && + nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, + tfom->cookie.len, tfom->cookie.val) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int tcp_metrics_dump_info(struct sk_buff *skb, + struct netlink_callback *cb, + struct tcp_metrics_block *tm) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &tcp_metrics_nl_family, NLM_F_MULTI, + TCP_METRICS_CMD_GET); + if (!hdr) + return -EMSGSIZE; + + if (tcp_metrics_fill_info(skb, tm) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int tcp_metrics_nl_dump(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; + unsigned int row, s_row = cb->args[0]; + int s_col = cb->args[1], col = s_col; + + for (row = s_row; row < max_rows; row++, s_col = 0) { + struct tcp_metrics_block *tm; + struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; + + rcu_read_lock(); + for (col = 0, tm = rcu_dereference(hb->chain); tm; + tm = rcu_dereference(tm->tcpm_next), col++) { + if (col < s_col) + continue; + if (tcp_metrics_dump_info(skb, cb, tm) < 0) { + rcu_read_unlock(); + goto done; + } + } + rcu_read_unlock(); + } + +done: + cb->args[0] = row; + cb->args[1] = col; + return skb->len; +} + +static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, + unsigned int *hash, int optional) +{ + struct nlattr *a; + + a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4]; + if (a) { + addr->family = AF_INET; + addr->addr.a4 = nla_get_be32(a); + *hash = (__force unsigned int) addr->addr.a4; + return 0; + } + a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; + if (a) { + if (nla_len(a) != sizeof(struct in6_addr)) + return -EINVAL; + addr->family = AF_INET6; + memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); + *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); + return 0; + } + return optional ? 1 : -EAFNOSUPPORT; +} + +static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct tcp_metrics_block *tm; + struct inetpeer_addr addr; + unsigned int hash; + struct sk_buff *msg; + struct net *net = genl_info_net(info); + void *reply; + int ret; + + ret = parse_nl_addr(info, &addr, &hash, 0); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0, + info->genlhdr->cmd); + if (!reply) + goto nla_put_failure; + + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + ret = -ESRCH; + rcu_read_lock(); + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + tm = rcu_dereference(tm->tcpm_next)) { + if (addr_same(&tm->tcpm_addr, &addr)) { + ret = tcp_metrics_fill_info(msg, tm); + break; + } + } + rcu_read_unlock(); + if (ret < 0) + goto out_free; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +nla_put_failure: + ret = -EMSGSIZE; + +out_free: + nlmsg_free(msg); + return ret; +} + +#define deref_locked_genl(p) \ + rcu_dereference_protected(p, lockdep_genl_is_held() && \ + lockdep_is_held(&tcp_metrics_lock)) + +#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) + +static int tcp_metrics_flush_all(struct net *net) +{ + unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; + struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; + struct tcp_metrics_block *tm; + unsigned int row; + + for (row = 0; row < max_rows; row++, hb++) { + spin_lock_bh(&tcp_metrics_lock); + tm = deref_locked_genl(hb->chain); + if (tm) + hb->chain = NULL; + spin_unlock_bh(&tcp_metrics_lock); + while (tm) { + struct tcp_metrics_block *next; + + next = deref_genl(tm->tcpm_next); + kfree_rcu(tm, rcu_head); + tm = next; + } + } + return 0; +} + +static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct tcpm_hash_bucket *hb; + struct tcp_metrics_block *tm; + struct tcp_metrics_block __rcu **pp; + struct inetpeer_addr addr; + unsigned int hash; + struct net *net = genl_info_net(info); + int ret; + + ret = parse_nl_addr(info, &addr, &hash, 1); + if (ret < 0) + return ret; + if (ret > 0) + return tcp_metrics_flush_all(net); + + hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hb = net->ipv4.tcp_metrics_hash + hash; + pp = &hb->chain; + spin_lock_bh(&tcp_metrics_lock); + for (tm = deref_locked_genl(*pp); tm; + pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { + if (addr_same(&tm->tcpm_addr, &addr)) { + *pp = tm->tcpm_next; + break; + } + } + spin_unlock_bh(&tcp_metrics_lock); + if (!tm) + return -ESRCH; + kfree_rcu(tm, rcu_head); + return 0; +} + +static struct genl_ops tcp_metrics_nl_ops[] = { + { + .cmd = TCP_METRICS_CMD_GET, + .doit = tcp_metrics_nl_cmd_get, + .dumpit = tcp_metrics_nl_dump, + .policy = tcp_metrics_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = TCP_METRICS_CMD_DEL, + .doit = tcp_metrics_nl_cmd_del, + .policy = tcp_metrics_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + static unsigned int tcpmhash_entries; static int __init set_tcpmhash_entries(char *str) { @@ -722,7 +1031,10 @@ static int __net_init tcp_net_metrics_init(struct net *net) net->ipv4.tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; - net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL); + net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!net->ipv4.tcp_metrics_hash) + net->ipv4.tcp_metrics_hash = vzalloc(size); + if (!net->ipv4.tcp_metrics_hash) return -ENOMEM; @@ -743,7 +1055,10 @@ static void __net_exit tcp_net_metrics_exit(struct net *net) tm = next; } } - kfree(net->ipv4.tcp_metrics_hash); + if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash)) + vfree(net->ipv4.tcp_metrics_hash); + else + kfree(net->ipv4.tcp_metrics_hash); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { @@ -753,5 +1068,21 @@ static __net_initdata struct pernet_operations tcp_net_metrics_ops = { void __init tcp_metrics_init(void) { - register_pernet_subsys(&tcp_net_metrics_ops); + int ret; + + ret = register_pernet_subsys(&tcp_net_metrics_ops); + if (ret < 0) + goto cleanup; + ret = genl_register_family_with_ops(&tcp_metrics_nl_family, + tcp_metrics_nl_ops, + ARRAY_SIZE(tcp_metrics_nl_ops)); + if (ret < 0) + goto cleanup_subsys; + return; + +cleanup_subsys: + unregister_pernet_subsys(&tcp_net_metrics_ops); + +cleanup: + return; } |