diff options
Diffstat (limited to 'net/core/sysctl_net_core.c')
| -rw-r--r-- | net/core/sysctl_net_core.c | 311 |
1 files changed, 214 insertions, 97 deletions
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5b1ce656baa1..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -16,19 +16,26 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/sched/isolation.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> +#include <net/rps.h> #include "dev.h" +#include "net-sysfs.h" static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; +static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -45,8 +52,89 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) +static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) +{ + char *kbuf; + int len; + + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + + /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 + * nibbles (except the last one which has a newline instead). + * Guesstimate the buffer size at the group granularity level. + */ + len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) { + *lenp = 0; + return -ENOMEM; + } + + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + goto free_buf; + } + + /* scnprintf writes a trailing null char not counted in the returned + * length, override it with a newline. + */ + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; + +free_buf: + kfree(kbuf); + return 0; +} +#endif + #ifdef CONFIG_RPS -static int rps_sock_flow_sysctl(struct ctl_table *table, int write, + +DEFINE_MUTEX(rps_default_mask_mutex); + +static int rps_default_mask_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = (struct net *)table->data; + struct cpumask *mask; + int err = 0; + + mutex_lock(&rps_default_mask_mutex); + mask = net->core.rps_default_mask; + if (write) { + if (!mask) { + mask = kzalloc(cpumask_size(), GFP_KERNEL); + net->core.rps_default_mask = mask; + } + err = -ENOMEM; + if (!mask) + goto done; + + err = cpumask_parse(buffer, mask); + if (err) + goto done; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto done; + } else { + err = dump_cpumask(buffer, lenp, ppos, + mask ?: cpu_none_mask); + } + +done: + mutex_unlock(&rps_default_mask_mutex); + return err; +} + +static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; @@ -61,7 +149,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; @@ -82,7 +171,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -93,7 +183,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); @@ -101,7 +192,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -115,7 +206,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); -static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, +static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; @@ -139,7 +230,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -148,20 +239,13 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { - char kbuf[128]; - - if (*ppos || !*lenp) { - *lenp = 0; - goto done; - } - cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { @@ -171,17 +255,7 @@ write_unlock: } rcu_read_unlock(); - len = min(sizeof(kbuf) - 1, *lenp); - len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); - if (!len) { - *lenp = 0; - goto done; - } - if (len < *lenp) - kbuf[len++] = '\n'; - memcpy(buffer, kbuf, len); - *lenp = len; - *ppos += len; + ret = dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -189,7 +263,7 @@ done: return ret; } -static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, +static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; @@ -211,7 +285,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED -static int set_default_qdisc(struct ctl_table *table, int write, +static int set_default_qdisc(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; @@ -230,25 +304,25 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif -static int proc_do_dev_weight(struct ctl_table *table, int write, +static int proc_do_dev_weight(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); - WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); - WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); return ret; } -static int proc_do_rss_key(struct ctl_table *table, int write, +static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; @@ -261,7 +335,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, } #ifdef CONFIG_BPF_JIT -static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, +static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -294,7 +368,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int -proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -305,7 +379,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, # endif /* CONFIG_HAVE_EBPF_JIT */ static int -proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -317,36 +391,12 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, static struct ctl_table net_core_table[] = { { - .procname = "wmem_max", - .data = &sysctl_wmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_max", - .data = &sysctl_rmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { - .procname = "wmem_default", - .data = &sysctl_wmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &sysctl_rmem_default, + .procname = "mem_pcpu_rsv", + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, + .extra1 = &min_mem_pcpu_rsv, }, { .procname = "dev_weight", @@ -354,6 +404,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", @@ -361,6 +412,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", @@ -368,10 +420,11 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -430,7 +483,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_tstamp_prequeue", - .data = &netdev_tstamp_prequeue, + .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -449,22 +502,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tstamp_allow_data", - .data = &sysctl_tstamp_allow_data, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -515,7 +552,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -529,7 +566,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -538,11 +575,11 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_budget_usecs", - .data = &netdev_budget_usecs, + .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", @@ -571,7 +608,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "gro_normal_batch", - .data = &gro_normal_batch, + .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -588,16 +625,23 @@ static struct ctl_table net_core_table[] = { }, { .procname = "skb_defer_max", - .data = &sysctl_skb_defer_max, + .data = &net_hotdata.sysctl_skb_defer_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, }, - { } }; static struct ctl_table netns_core_table[] = { +#if IS_ENABLED(CONFIG_RPS) + { + .procname = "rps_default_mask", + .data = &init_net, + .mode = 0644, + .proc_handler = rps_default_mask_sysctl + }, +#endif { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, @@ -607,6 +651,14 @@ static struct ctl_table netns_core_table[] = { .proc_handler = proc_dointvec_minmax }, { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { .procname = "txrehash", .data = &init_net.core.sysctl_txrehash, .maxlen = sizeof(u8), @@ -615,7 +667,66 @@ static struct ctl_table netns_core_table[] = { .extra2 = SYSCTL_ONE, .proc_handler = proc_dou8vec_minmax, }, - { } + { + .procname = "txq_reselection_ms", + .data = &init_net.core.sysctl_txq_reselection, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + /* sysctl_core_net_init() will set the values after this + * to readonly in network namespaces + */ + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -633,24 +744,27 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { - struct ctl_table *tbl, *tmp; + size_t table_size = ARRAY_SIZE(netns_core_table); + struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - for (tmp = tbl; tmp->procname; tmp++) - tmp->data += (char *)net - (char *)&init_net; + for (i = 0; i < table_size; ++i) { + if (tbl[i].data == &sysctl_wmem_max) + break; - /* Don't export any sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; + tbl[i].data += (char *)net - (char *)&init_net; } + for (; i < table_size; ++i) + tbl[i].mode &= ~0222; } - net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -665,11 +779,14 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); +#if IS_ENABLED(CONFIG_RPS) + kfree(net->core.rps_default_mask); +#endif kfree(tbl); } |
