diff options
Diffstat (limited to 'net/core/sysctl_net_core.c')
| -rw-r--r-- | net/core/sysctl_net_core.c | 360 |
1 files changed, 250 insertions, 110 deletions
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7b4d485aac7a..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -16,21 +16,26 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/sched/isolation.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> +#include <net/hotdata.h> +#include <net/proto_memory.h> +#include <net/rps.h> + +#include "dev.h" +#include "net-sysfs.h" -static int two = 2; -static int three = 3; static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; -static long long_one __maybe_unused = 1; -static long long_max __maybe_unused = LONG_MAX; +static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -47,8 +52,89 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); +#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS) +static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos, + struct cpumask *mask) +{ + char *kbuf; + int len; + + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + + /* CPUs are displayed as a hex bitmap + a comma between each groups of 8 + * nibbles (except the last one which has a newline instead). + * Guesstimate the buffer size at the group granularity level. + */ + len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp); + kbuf = kmalloc(len, GFP_KERNEL); + if (!kbuf) { + *lenp = 0; + return -ENOMEM; + } + + len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); + if (!len) { + *lenp = 0; + goto free_buf; + } + + /* scnprintf writes a trailing null char not counted in the returned + * length, override it with a newline. + */ + kbuf[len++] = '\n'; + memcpy(buffer, kbuf, len); + *lenp = len; + *ppos += len; + +free_buf: + kfree(kbuf); + return 0; +} +#endif + #ifdef CONFIG_RPS -static int rps_sock_flow_sysctl(struct ctl_table *table, int write, + +DEFINE_MUTEX(rps_default_mask_mutex); + +static int rps_default_mask_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = (struct net *)table->data; + struct cpumask *mask; + int err = 0; + + mutex_lock(&rps_default_mask_mutex); + mask = net->core.rps_default_mask; + if (write) { + if (!mask) { + mask = kzalloc(cpumask_size(), GFP_KERNEL); + net->core.rps_default_mask = mask; + } + err = -ENOMEM; + if (!mask) + goto done; + + err = cpumask_parse(buffer, mask); + if (err) + goto done; + + err = rps_cpumask_housekeeping(mask); + if (err) + goto done; + } else { + err = dump_cpumask(buffer, lenp, ppos, + mask ?: cpu_none_mask); + } + +done: + mutex_unlock(&rps_default_mask_mutex); + return err; +} + +static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; @@ -63,7 +149,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_lock(&sock_flow_mutex); - orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, + orig_sock_table = rcu_dereference_protected( + net_hotdata.rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; @@ -84,7 +171,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; + net_hotdata.rps_cpu_mask = + roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; @@ -95,7 +183,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, sock_table = NULL; if (sock_table != orig_sock_table) { - rcu_assign_pointer(rps_sock_flow_table, sock_table); + rcu_assign_pointer(net_hotdata.rps_sock_flow_table, + sock_table); if (sock_table) { static_branch_inc(&rps_needed); static_branch_inc(&rfs_needed); @@ -103,8 +192,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - synchronize_rcu(); - vfree(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -118,7 +206,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); -static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, +static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; @@ -142,8 +230,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - synchronize_rcu(); - kfree(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -152,20 +239,13 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { - char kbuf[128]; - - if (*ppos || !*lenp) { - *lenp = 0; - goto done; - } - cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { @@ -175,17 +255,7 @@ write_unlock: } rcu_read_unlock(); - len = min(sizeof(kbuf) - 1, *lenp); - len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); - if (!len) { - *lenp = 0; - goto done; - } - if (len < *lenp) - kbuf[len++] = '\n'; - memcpy(buffer, kbuf, len); - *lenp = len; - *ppos += len; + ret = dump_cpumask(buffer, lenp, ppos, mask); } done: @@ -193,7 +263,7 @@ done: return ret; } -static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, +static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; @@ -215,7 +285,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED -static int set_default_qdisc(struct ctl_table *table, int write, +static int set_default_qdisc(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; @@ -234,22 +304,25 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif -static int proc_do_dev_weight(struct ctl_table *table, int write, +static int proc_do_dev_weight(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret; - - ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret != 0) - return ret; - - dev_rx_weight = weight_p * dev_weight_rx_bias; - dev_tx_weight = weight_p * dev_weight_tx_bias; + static DEFINE_MUTEX(dev_weight_mutex); + int ret, weight; + + mutex_lock(&dev_weight_mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + weight = READ_ONCE(weight_p); + WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias); + } + mutex_unlock(&dev_weight_mutex); return ret; } -static int proc_do_rss_key(struct ctl_table *table, int write, +static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; @@ -262,11 +335,13 @@ static int proc_do_rss_key(struct ctl_table *table, int write, } #ifdef CONFIG_BPF_JIT -static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, +static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; + int min = *(int *)table->extra1; + int max = *(int *)table->extra2; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) @@ -284,12 +359,16 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, ret = -EPERM; } } + + if (write && ret && min == max) + pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n"); + return ret; } # ifdef CONFIG_HAVE_EBPF_JIT static int -proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -300,7 +379,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, # endif /* CONFIG_HAVE_EBPF_JIT */ static int -proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, +proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) @@ -312,36 +391,12 @@ proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, static struct ctl_table net_core_table[] = { { - .procname = "wmem_max", - .data = &sysctl_wmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_max", - .data = &sysctl_rmem_max, + .procname = "mem_pcpu_rsv", + .data = &net_hotdata.sysctl_mem_pcpu_rsv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, - }, - { - .procname = "wmem_default", - .data = &sysctl_wmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_sndbuf, - }, - { - .procname = "rmem_default", - .data = &sysctl_rmem_default, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &min_rcvbuf, + .extra1 = &min_mem_pcpu_rsv, }, { .procname = "dev_weight", @@ -349,6 +404,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", @@ -356,6 +412,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", @@ -363,10 +420,11 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", - .data = &netdev_max_backlog, + .data = &net_hotdata.max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -390,7 +448,7 @@ static struct ctl_table net_core_table[] = { .extra2 = SYSCTL_ONE, # else .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, # endif }, # ifdef CONFIG_HAVE_EBPF_JIT @@ -401,7 +459,7 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "bpf_jit_kallsyms", @@ -419,13 +477,13 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, - .extra1 = &long_one, + .extra1 = SYSCTL_LONG_ONE, .extra2 = &bpf_jit_limit_max, }, #endif { .procname = "netdev_tstamp_prequeue", - .data = &netdev_tstamp_prequeue, + .data = &net_hotdata.tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -444,22 +502,6 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "optmem_max", - .data = &sysctl_optmem_max, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tstamp_allow_data", - .data = &sysctl_tstamp_allow_data, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", @@ -510,7 +552,7 @@ static struct ctl_table net_core_table[] = { #endif { .procname = "netdev_budget", - .data = &netdev_budget, + .data = &net_hotdata.netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -524,7 +566,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "max_skb_frags", - .data = &sysctl_max_skb_frags, + .data = &net_hotdata.sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -533,11 +575,11 @@ static struct ctl_table net_core_table[] = { }, { .procname = "netdev_budget_usecs", - .data = &netdev_budget_usecs, + .data = &net_hotdata.netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", @@ -546,7 +588,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "devconf_inherit_init_net", @@ -555,7 +597,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &three, + .extra2 = SYSCTL_THREE, }, { .procname = "high_order_alloc_disable", @@ -566,7 +608,7 @@ static struct ctl_table net_core_table[] = { }, { .procname = "gro_normal_batch", - .data = &gro_normal_batch, + .data = &net_hotdata.gro_normal_batch, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -581,10 +623,25 @@ static struct ctl_table net_core_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, - { } + { + .procname = "skb_defer_max", + .data = &net_hotdata.sysctl_skb_defer_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, }; static struct ctl_table netns_core_table[] = { +#if IS_ENABLED(CONFIG_RPS) + { + .procname = "rps_default_mask", + .data = &init_net, + .mode = 0644, + .proc_handler = rps_default_mask_sysctl + }, +#endif { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, @@ -593,7 +650,83 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .proc_handler = proc_dointvec_minmax }, - { } + { + .procname = "optmem_max", + .data = &init_net.core.sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .proc_handler = proc_dointvec_minmax + }, + { + .procname = "txrehash", + .data = &init_net.core.sysctl_txrehash, + .maxlen = sizeof(u8), + .mode = 0644, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "txq_reselection_ms", + .data = &init_net.core.sysctl_txq_reselection, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "tstamp_allow_data", + .data = &init_net.core.sysctl_tstamp_allow_data, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + /* sysctl_core_net_init() will set the values after this + * to readonly in network namespaces + */ + { + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, + { + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, }; static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str) @@ -611,23 +744,27 @@ __setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup); static __net_init int sysctl_core_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(netns_core_table); struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { + int i; tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; - tbl[0].data = &net->core.sysctl_somaxconn; + for (i = 0; i < table_size; ++i) { + if (tbl[i].data == &sysctl_wmem_max) + break; - /* Don't export any sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) { - tbl[0].procname = NULL; + tbl[i].data += (char *)net - (char *)&init_net; } + for (; i < table_size; ++i) + tbl[i].mode &= ~0222; } - net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); + net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size); if (net->core.sysctl_hdr == NULL) goto err_reg; @@ -642,11 +779,14 @@ err_dup: static __net_exit void sysctl_core_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); +#if IS_ENABLED(CONFIG_RPS) + kfree(net->core.rps_default_mask); +#endif kfree(tbl); } |
