diff options
Diffstat (limited to 'kernel')
116 files changed, 7484 insertions, 3593 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 3c13240dfc9f..87866b037fbe 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -116,7 +116,6 @@ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o -obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit.c b/kernel/audit.c index e7a62ebbf4d1..1edaa4846a47 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1612,7 +1612,7 @@ static void audit_log_multicast(int group, const char *op, int err) cred = current_cred(); tty = audit_get_tty(); audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u", - task_pid_nr(current), + task_tgid_nr(current), from_kuid(&init_user_ns, cred->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), tty ? tty_name(tty) : "(none)", @@ -1706,7 +1706,7 @@ static int __init audit_init(void) audit_cmd_mutex.owner = NULL; pr_info("initializing netlink subsys (%s)\n", - audit_default ? "enabled" : "disabled"); + str_enabled_disabled(audit_default)); register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d6ef4f4f9cba..470041c49a44 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1344,7 +1344,7 @@ int audit_filter(int msgtype, unsigned int listtype) switch (f->type) { case AUDIT_PID: - pid = task_pid_nr(current); + pid = task_tgid_nr(current); result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6f0d6fb6523f..cd57053b4a69 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2933,7 +2933,7 @@ void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, audit_log_format(ab, "table=%s family=%u entries=%u op=%s", name, af, nentries, audit_nfcfgs[op].s); - audit_log_format(ab, " pid=%u", task_pid_nr(current)); + audit_log_format(ab, " pid=%u", task_tgid_nr(current)); audit_log_task_context(ab); /* subj= */ audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, get_task_comm(comm, current)); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 520f49f422fe..ba91be08763a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -823,9 +823,11 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset) const char *src = btf_str_by_offset(btf, offset); const char *src_limit; + if (!*src) + return false; + /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; - src++; while (*src && src < src_limit) { if (!isprint(*src)) return false; @@ -6283,7 +6285,7 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, errout: btf_verifier_env_free(env); - if (base_btf != vmlinux_btf) + if (!IS_ERR(base_btf) && base_btf != vmlinux_btf) btf_free(base_btf); if (btf) { kvfree(btf->data); @@ -6523,6 +6525,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + info->reg_type |= PTR_MAYBE_NULL; + if (tgt_prog) { enum bpf_prog_type tgt_type; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index fbdf5a1aabfe..a2f46785ac3b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -354,12 +354,14 @@ static int cpu_map_kthread_run(void *data) list_add_tail(&skb->list, &list); } - netif_receive_skb_list(&list); - /* Feedback loop via tracepoint */ + /* Feedback loop via tracepoint. + * NB: keep before recv to allow measuring enqueue/dequeue latency. + */ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops, sched, &stats); + netif_receive_skb_list(&list); local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8520095ca03..39d5710c68ad 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -28,6 +28,8 @@ #include <linux/cpumask.h> #include <linux/bpf_mem_alloc.h> #include <net/xdp.h> +#include <linux/trace_events.h> +#include <linux/kallsyms.h> #include "disasm.h" @@ -21154,11 +21156,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, { bool prog_extension = prog->type == BPF_PROG_TYPE_EXT; bool prog_tracing = prog->type == BPF_PROG_TYPE_TRACING; + char trace_symbol[KSYM_SYMBOL_LEN]; const char prefix[] = "btf_trace_"; + struct bpf_raw_event_map *btp; int ret = 0, subprog = -1, i; const struct btf_type *t; bool conservative = true; - const char *tname; + const char *tname, *fname; struct btf *btf; long addr = 0; struct module *mod = NULL; @@ -21289,10 +21293,34 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return -EINVAL; } tname += sizeof(prefix) - 1; - t = btf_type_by_id(btf, t->type); - if (!btf_type_is_ptr(t)) - /* should never happen in valid vmlinux build */ + + /* The func_proto of "btf_trace_##tname" is generated from typedef without argument + * names. Thus using bpf_raw_event_map to get argument names. + */ + btp = bpf_get_raw_tracepoint(tname); + if (!btp) return -EINVAL; + fname = kallsyms_lookup((unsigned long)btp->bpf_func, NULL, NULL, NULL, + trace_symbol); + bpf_put_raw_tracepoint(btp); + + if (fname) + ret = btf_find_by_name_kind(btf, fname, BTF_KIND_FUNC); + + if (!fname || ret < 0) { + bpf_log(log, "Cannot find btf of tracepoint template, fall back to %s%s.\n", + prefix, tname); + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_ptr(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + } else { + t = btf_type_by_id(btf, ret); + if (!btf_type_is_func(t)) + /* should never happen in valid vmlinux build */ + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); if (!btf_type_is_func_proto(t)) /* should never happen in valid vmlinux build */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 12f8457ad1f9..a5c9359d516f 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o obj-$(CONFIG_CGROUP_MISC) += misc.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 520b90dd97ec..c964dd7ff967 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -81,6 +81,8 @@ struct cgroup_file_ctx { struct { struct cgroup_pidlist *pidlist; } procs1; + + struct cgroup_of_peak peak; }; /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index b9dbf6bf2779..e28d5f0d20ed 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -46,6 +46,12 @@ bool cgroup1_ssid_disabled(int ssid) return cgroup_no_v1_mask & (1 << ssid); } +static bool cgroup1_subsys_absent(struct cgroup_subsys *ss) +{ + /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */ + return ss->legacy_cftypes == NULL && ss->dfl_cftypes; +} + /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task @@ -675,11 +681,14 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) * cgroup_mutex contention. */ - for_each_subsys(ss, i) + for_each_subsys(ss, i) { + if (cgroup1_subsys_absent(ss)) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); + } return 0; } @@ -932,7 +941,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { - if (strcmp(param->key, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name) || + cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", @@ -1024,7 +1034,8 @@ static int check_cgroupfs_options(struct fs_context *fc) mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && + !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c8e4b62b436a..f38f61bc068a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1972,6 +1972,13 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param return -EINVAL; } +struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) +{ + struct cgroup_file_ctx *ctx = of->priv; + + return &ctx->peak; +} + static void apply_cgroup_root_flags(unsigned int root_flags) { if (current->nsproxy->cgroup_ns == &init_cgroup_ns) { @@ -2331,7 +2338,7 @@ static struct file_system_type cgroup2_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, @@ -3669,12 +3676,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v) static int cgroup_stat_show(struct seq_file *seq, void *v) { struct cgroup *cgroup = seq_css(seq)->cgroup; + struct cgroup_subsys_state *css; + int dying_cnt[CGROUP_SUBSYS_COUNT]; + int ssid; seq_printf(seq, "nr_descendants %d\n", cgroup->nr_descendants); + + /* + * Show the number of live and dying csses associated with each of + * non-inhibited cgroup subsystems that is bound to cgroup v2. + * + * Without proper lock protection, racing is possible. So the + * numbers may not be consistent when that happens. + */ + rcu_read_lock(); + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + dying_cnt[ssid] = -1; + if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) || + (cgroup_subsys[ssid]->root != &cgrp_dfl_root)) + continue; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid]; + seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name, + css ? (css->nr_descendants + 1) : 0); + } + seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - + for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) { + if (dying_cnt[ssid] >= 0) + seq_printf(seq, "nr_dying_subsys_%s %d\n", + cgroup_subsys[ssid]->name, dying_cnt[ssid]); + } + rcu_read_unlock(); return 0; } @@ -4096,7 +4131,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, * If namespaces are delegation boundaries, disallow writes to * files in an non-init namespace root from inside the namespace * except for the files explicitly marked delegatable - - * cgroup.procs and cgroup.subtree_control. + * eg. cgroup.procs, cgroup.threads and cgroup.subtree_control. */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && @@ -4595,8 +4630,9 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @root are accessible and @pos is a descendant of @root. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @root are accessible and @pos is a descendant of @root. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -4644,8 +4680,9 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct rightmost descendant as - * long as @pos is accessible. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct rightmost descendant as long as @pos + * is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) @@ -4689,9 +4726,9 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * * While this function requires cgroup_mutex or RCU read locking, it * doesn't require the whole traversal to be contained in a single critical - * section. This function will return the correct next descendant as long - * as both @pos and @cgroup are accessible and @pos is a descendant of - * @cgroup. + * section. Additionally, it isn't necessary to hold onto a reference to @pos. + * This function will return the correct next descendant as long as both @pos + * and @cgroup are accessible and @pos is a descendant of @cgroup. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the @@ -5424,6 +5461,8 @@ static void css_release_work_fn(struct work_struct *work) list_del_rcu(&css->sibling); if (ss) { + struct cgroup *parent_cgrp; + /* css release path */ if (!list_empty(&css->rstat_css_node)) { cgroup_rstat_flush(cgrp); @@ -5433,6 +5472,21 @@ static void css_release_work_fn(struct work_struct *work) cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); + + cgrp->nr_dying_subsys[ss->id]--; + /* + * When a css is released and ready to be freed, its + * nr_descendants must be zero. However, the corresponding + * cgrp->nr_dying_subsys[ss->id] may not be 0 if a subsystem + * is activated and deactivated multiple times with one or + * more of its previous activation leaving behind dying csses. + */ + WARN_ON_ONCE(css->nr_descendants); + parent_cgrp = cgroup_parent(cgrp); + while (parent_cgrp) { + parent_cgrp->nr_dying_subsys[ss->id]--; + parent_cgrp = cgroup_parent(parent_cgrp); + } } else { struct cgroup *tcgrp; @@ -5517,8 +5571,11 @@ static int online_css(struct cgroup_subsys_state *css) rcu_assign_pointer(css->cgroup->subsys[ss->id], css); atomic_inc(&css->online_cnt); - if (css->parent) + if (css->parent) { atomic_inc(&css->parent->online_cnt); + while ((css = css->parent)) + css->nr_descendants++; + } } return ret; } @@ -5540,6 +5597,16 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); wake_up_all(&css->cgroup->offline_waitq); + + css->cgroup->nr_dying_subsys[ss->id]++; + /* + * Parent css and cgroup cannot be freed until after the freeing + * of child css, see css_free_rwork_fn(). + */ + while ((css = css->parent)) { + css->nr_descendants--; + css->cgroup->nr_dying_subsys[ss->id]++; + } } /** @@ -6178,7 +6245,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); -#ifdef CONFIG_CPUSETS +#ifdef CONFIG_CPUSETS_V1 WARN_ON(register_filesystem(&cpuset_fs_type)); #endif diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h new file mode 100644 index 000000000000..976a8bc3ff60 --- /dev/null +++ b/kernel/cgroup/cpuset-internal.h @@ -0,0 +1,305 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __CPUSET_INTERNAL_H +#define __CPUSET_INTERNAL_H + +#include <linux/cgroup.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/spinlock.h> +#include <linux/union_find.h> + +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time64_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + +/* + * Invalid partition error code + */ +enum prs_errcode { + PERR_NONE = 0, + PERR_INVCPUS, + PERR_INVPARENT, + PERR_NOTPART, + PERR_NOTEXCL, + PERR_NOCPUS, + PERR_HOTPLUG, + PERR_CPUSEMPTY, + PERR_HKEEPING, + PERR_ACCESS, +}; + +/* bits in struct cpuset flags field */ +typedef enum { + CS_ONLINE, + CS_CPU_EXCLUSIVE, + CS_MEM_EXCLUSIVE, + CS_MEM_HARDWALL, + CS_MEMORY_MIGRATE, + CS_SCHED_LOAD_BALANCE, + CS_SPREAD_PAGE, + CS_SPREAD_SLAB, +} cpuset_flagbits_t; + +/* The various types of files and directories in a cpuset file system */ + +typedef enum { + FILE_MEMORY_MIGRATE, + FILE_CPULIST, + FILE_MEMLIST, + FILE_EFFECTIVE_CPULIST, + FILE_EFFECTIVE_MEMLIST, + FILE_SUBPARTS_CPULIST, + FILE_EXCLUSIVE_CPULIST, + FILE_EFFECTIVE_XCPULIST, + FILE_ISOLATED_CPULIST, + FILE_CPU_EXCLUSIVE, + FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, + FILE_SCHED_LOAD_BALANCE, + FILE_PARTITION_ROOT, + FILE_SCHED_RELAX_DOMAIN_LEVEL, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, +} cpuset_filetype_t; + +struct cpuset { + struct cgroup_subsys_state css; + + unsigned long flags; /* "unsigned long" so bitops work */ + + /* + * On default hierarchy: + * + * The user-configured masks can only be changed by writing to + * cpuset.cpus and cpuset.mems, and won't be limited by the + * parent masks. + * + * The effective masks is the real masks that apply to the tasks + * in the cpuset. They may be changed if the configured masks are + * changed or hotplug happens. + * + * effective_mask == configured_mask & parent's effective_mask, + * and if it ends up empty, it will inherit the parent's mask. + * + * + * On legacy hierarchy: + * + * The user-configured masks are always the same with effective masks. + */ + + /* user-configured CPUs and Memory Nodes allow to tasks */ + cpumask_var_t cpus_allowed; + nodemask_t mems_allowed; + + /* effective CPUs and Memory Nodes allow to tasks */ + cpumask_var_t effective_cpus; + nodemask_t effective_mems; + + /* + * Exclusive CPUs dedicated to current cgroup (default hierarchy only) + * + * The effective_cpus of a valid partition root comes solely from its + * effective_xcpus and some of the effective_xcpus may be distributed + * to sub-partitions below & hence excluded from its effective_cpus. + * For a valid partition root, its effective_cpus have no relationship + * with cpus_allowed unless its exclusive_cpus isn't set. + * + * This value will only be set if either exclusive_cpus is set or + * when this cpuset becomes a local partition root. + */ + cpumask_var_t effective_xcpus; + + /* + * Exclusive CPUs as requested by the user (default hierarchy only) + * + * Its value is independent of cpus_allowed and designates the set of + * CPUs that can be granted to the current cpuset or its children when + * it becomes a valid partition root. The effective set of exclusive + * CPUs granted (effective_xcpus) depends on whether those exclusive + * CPUs are passed down by its ancestors and not yet taken up by + * another sibling partition root along the way. + * + * If its value isn't set, it defaults to cpus_allowed. + */ + cpumask_var_t exclusive_cpus; + + /* + * This is old Memory Nodes tasks took on. + * + * - top_cpuset.old_mems_allowed is initialized to mems_allowed. + * - A new cpuset's old_mems_allowed is initialized when some + * task is moved into it. + * - old_mems_allowed is used in cpuset_migrate_mm() when we change + * cpuset.mems_allowed and have tasks' nodemask updated, and + * then old_mems_allowed is updated to mems_allowed. + */ + nodemask_t old_mems_allowed; + + struct fmeter fmeter; /* memory_pressure filter */ + + /* + * Tasks are being attached to this cpuset. Used to prevent + * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). + */ + int attach_in_progress; + + /* for custom sched domain */ + int relax_domain_level; + + /* number of valid local child partitions */ + int nr_subparts; + + /* partition root state */ + int partition_root_state; + + /* + * number of SCHED_DEADLINE tasks attached to this cpuset, so that we + * know when to rebuild associated root domain bandwidth information. + */ + int nr_deadline_tasks; + int nr_migrate_dl_tasks; + u64 sum_migrate_dl_bw; + + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + + /* Handle for cpuset.cpus.partition */ + struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; + + /* Used to merge intersecting subsets for generate_sched_domains */ + struct uf_node node; +}; + +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuset, css) : NULL; +} + +/* Retrieve the cpuset for a task */ +static inline struct cpuset *task_cs(struct task_struct *task) +{ + return css_cs(task_css(task, cpuset_cgrp_id)); +} + +static inline struct cpuset *parent_cs(struct cpuset *cs) +{ + return css_cs(cs->css.parent); +} + +/* convenient tests for these bits */ +static inline bool is_cpuset_online(struct cpuset *cs) +{ + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); +} + +static inline int is_cpu_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + +static inline int is_sched_load_balance(const struct cpuset *cs) +{ + return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); +} + +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); +} + +/** + * cpuset_for_each_child - traverse online children of a cpuset + * @child_cs: loop cursor pointing to the current child + * @pos_css: used for iteration + * @parent_cs: target cpuset to walk children of + * + * Walk @child_cs through the online children of @parent_cs. Must be used + * with RCU read locked. + */ +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) + +/** + * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants + * @des_cs: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @root_cs: target cpuset to walk ancestor of + * + * Walk @des_cs through the online descendants of @root_cs. Must be used + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. + */ +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) + +void rebuild_sched_domains_locked(void); +void cpuset_callback_lock_irq(void); +void cpuset_callback_unlock_irq(void); +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus); +void cpuset_update_tasks_nodemask(struct cpuset *cs); +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int cpuset_common_seq_show(struct seq_file *sf, void *v); + +/* + * cpuset-v1.c + */ +#ifdef CONFIG_CPUSETS_V1 +extern struct cftype cpuset1_files[]; +void fmeter_init(struct fmeter *fmp); +void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk); +void cpuset1_update_tasks_flags(struct cpuset *cs); +void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated); +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); +#else +static inline void fmeter_init(struct fmeter *fmp) {} +static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) {} +static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} +static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) {} +static inline int cpuset1_validate_change(struct cpuset *cur, + struct cpuset *trial) { return 0; } +#endif /* CONFIG_CPUSETS_V1 */ + +#endif /* __CPUSET_INTERNAL_H */ diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c new file mode 100644 index 000000000000..25c1d7b77e2f --- /dev/null +++ b/kernel/cgroup/cpuset-v1.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "cpuset-internal.h" + +/* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + +/* + * Frequency meter - How fast is some event occurring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. + */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled __read_mostly; + +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + */ + +void __cpuset_memory_pressure_bump(void) +{ + rcu_read_lock(); + fmeter_markevent(&task_cs(current)->fmeter); + rcu_read_unlock(); +} + +static int update_relax_domain_level(struct cpuset *cs, s64 val) +{ +#ifdef CONFIG_SMP + if (val < -1 || val > sched_domain_level_max + 1) + return -EINVAL; +#endif + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + if (!cpumask_empty(cs->cpus_allowed) && + is_sched_load_balance(cs)) + rebuild_sched_domains_locked(); + } + + return 0; +} + +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = -ENODEV; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) + goto out_unlock; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +/* + * update task's spread flag if cpuset's page/slab spread flag is set + * + * Call with callback_lock or cpuset_mutex held. The check can be skipped + * if on default hierarchy. + */ +void cpuset1_update_task_spread_flags(struct cpuset *cs, + struct task_struct *tsk) +{ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + + if (is_spread_page(cs)) + task_set_spread_page(tsk); + else + task_clear_spread_page(tsk); + + if (is_spread_slab(cs)) + task_set_spread_slab(tsk); + else + task_clear_spread_slab(tsk); +} + +/** + * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset. + * @cs: the cpuset in which each task's spread flags needs to be changed + * + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. + */ +void cpuset1_update_tasks_flags(struct cpuset *cs) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, 0, &it); + while ((task = css_task_iter_next(&it))) + cpuset1_update_task_spread_flags(cs, task); + css_task_iter_end(&it); +} + +/* + * If CPU and/or memory hotplug handlers, below, unplug any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). + */ + parent = parent_cs(cs); + while (cpumask_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent_cs(parent); + + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + pr_err("cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); + } +} + +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + +void cpuset1_hotplug_update_tasks(struct cpuset *cs, + struct cpumask *new_cpus, nodemask_t *new_mems, + bool cpus_updated, bool mems_updated) +{ + bool is_empty; + + cpuset_callback_lock_irq(); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; + cpuset_callback_unlock_irq(); + + /* + * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty, + * as the tasks will be migrated to an ancestor. + */ + if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) + cpuset_update_tasks_cpumask(cs, new_cpus); + if (mems_updated && !nodes_empty(cs->mems_allowed)) + cpuset_update_tasks_nodemask(cs); + + is_empty = cpumask_empty(cs->cpus_allowed) || + nodes_empty(cs->mems_allowed); + + /* + * Move tasks to the nearest ancestor with execution resources, + * This is full cgroup operation which will also call back into + * cpuset. Execute it asynchronously using workqueue. + */ + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); + } +} + +/* + * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags + * are only set if the other's are set. Call holding cpuset_mutex. + */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/* + * cpuset1_validate_change() - Validate conditions specific to legacy (v1) + * behavior. + */ +int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } + + /* Unreachable but makes gcc happy */ + return 0; +} + +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) +{ + struct cpuset *cs = css_cs(css); + cpuset_filetype_t type = cft->private; + int retval = 0; + + cpus_read_lock(); + cpuset_lock(); + if (!is_cpuset_online(cs)) { + retval = -ENODEV; + goto out_unlock; + } + + switch (type) { + case FILE_CPU_EXCLUSIVE: + retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); + break; + case FILE_MEM_EXCLUSIVE: + retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); + break; + case FILE_SCHED_LOAD_BALANCE: + retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + break; + case FILE_MEMORY_MIGRATE: + retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + cpuset_memory_pressure_enabled = !!val; + break; + case FILE_SPREAD_PAGE: + retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); + break; + case FILE_SPREAD_SLAB: + retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); + break; + default: + retval = -EINVAL; + break; + } +out_unlock: + cpuset_unlock(); + cpus_read_unlock(); + return retval; +} + +/* + * for the common functions, 'private' gives the type of file + */ + +struct cftype cpuset1_files[] = { + { + .name = "cpus", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .seq_show = cpuset_common_seq_show, + .write = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + }, + + { + .name = "effective_cpus", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_CPULIST, + }, + + { + .name = "effective_mems", + .seq_show = cpuset_common_seq_show, + .private = FILE_EFFECTIVE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + /* obsolete, may be removed in the future */ + .name = "memory_spread_slab", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, + + { + .name = "memory_pressure_enabled", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE_ENABLED, + }, + + { } /* terminate */ +}; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4bd9e50bcc8e..a4dd285cdf39 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -22,11 +22,8 @@ * distribution for more details. */ #include "cgroup-internal.h" +#include "cpuset-internal.h" -#include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/cpuset.h> -#include <linux/delay.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel.h> @@ -40,10 +37,8 @@ #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/security.h> -#include <linux/spinlock.h> #include <linux/oom.h> #include <linux/sched/isolation.h> -#include <linux/cgroup.h> #include <linux/wait.h> #include <linux/workqueue.h> @@ -57,30 +52,6 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); -/* See "Frequency meter" comments, below. */ - -struct fmeter { - int cnt; /* unprocessed events count */ - int val; /* most recent output value */ - time64_t time; /* clock (secs) when val computed */ - spinlock_t lock; /* guards read or write of above */ -}; - -/* - * Invalid partition error code - */ -enum prs_errcode { - PERR_NONE = 0, - PERR_INVCPUS, - PERR_INVPARENT, - PERR_NOTPART, - PERR_NOTEXCL, - PERR_NOCPUS, - PERR_HOTPLUG, - PERR_CPUSEMPTY, - PERR_HKEEPING, -}; - static const char * const perr_strings[] = { [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive", [PERR_INVPARENT] = "Parent is an invalid partition root", @@ -90,133 +61,7 @@ static const char * const perr_strings[] = { [PERR_HOTPLUG] = "No cpu available due to hotplug", [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty", [PERR_HKEEPING] = "partition config conflicts with housekeeping setup", -}; - -struct cpuset { - struct cgroup_subsys_state css; - - unsigned long flags; /* "unsigned long" so bitops work */ - - /* - * On default hierarchy: - * - * The user-configured masks can only be changed by writing to - * cpuset.cpus and cpuset.mems, and won't be limited by the - * parent masks. - * - * The effective masks is the real masks that apply to the tasks - * in the cpuset. They may be changed if the configured masks are - * changed or hotplug happens. - * - * effective_mask == configured_mask & parent's effective_mask, - * and if it ends up empty, it will inherit the parent's mask. - * - * - * On legacy hierarchy: - * - * The user-configured masks are always the same with effective masks. - */ - - /* user-configured CPUs and Memory Nodes allow to tasks */ - cpumask_var_t cpus_allowed; - nodemask_t mems_allowed; - - /* effective CPUs and Memory Nodes allow to tasks */ - cpumask_var_t effective_cpus; - nodemask_t effective_mems; - - /* - * Exclusive CPUs dedicated to current cgroup (default hierarchy only) - * - * The effective_cpus of a valid partition root comes solely from its - * effective_xcpus and some of the effective_xcpus may be distributed - * to sub-partitions below & hence excluded from its effective_cpus. - * For a valid partition root, its effective_cpus have no relationship - * with cpus_allowed unless its exclusive_cpus isn't set. - * - * This value will only be set if either exclusive_cpus is set or - * when this cpuset becomes a local partition root. - */ - cpumask_var_t effective_xcpus; - - /* - * Exclusive CPUs as requested by the user (default hierarchy only) - * - * Its value is independent of cpus_allowed and designates the set of - * CPUs that can be granted to the current cpuset or its children when - * it becomes a valid partition root. The effective set of exclusive - * CPUs granted (effective_xcpus) depends on whether those exclusive - * CPUs are passed down by its ancestors and not yet taken up by - * another sibling partition root along the way. - * - * If its value isn't set, it defaults to cpus_allowed. - */ - cpumask_var_t exclusive_cpus; - - /* - * This is old Memory Nodes tasks took on. - * - * - top_cpuset.old_mems_allowed is initialized to mems_allowed. - * - A new cpuset's old_mems_allowed is initialized when some - * task is moved into it. - * - old_mems_allowed is used in cpuset_migrate_mm() when we change - * cpuset.mems_allowed and have tasks' nodemask updated, and - * then old_mems_allowed is updated to mems_allowed. - */ - nodemask_t old_mems_allowed; - - struct fmeter fmeter; /* memory_pressure filter */ - - /* - * Tasks are being attached to this cpuset. Used to prevent - * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). - */ - int attach_in_progress; - - /* partition number for rebuild_sched_domains() */ - int pn; - - /* for custom sched domain */ - int relax_domain_level; - - /* number of valid local child partitions */ - int nr_subparts; - - /* partition root state */ - int partition_root_state; - - /* - * Default hierarchy only: - * use_parent_ecpus - set if using parent's effective_cpus - * child_ecpus_count - # of children with use_parent_ecpus set - */ - int use_parent_ecpus; - int child_ecpus_count; - - /* - * number of SCHED_DEADLINE tasks attached to this cpuset, so that we - * know when to rebuild associated root domain bandwidth information. - */ - int nr_deadline_tasks; - int nr_migrate_dl_tasks; - u64 sum_migrate_dl_bw; - - /* Invalid partition error code, not lock protected */ - enum prs_errcode prs_err; - - /* Handle for cpuset.cpus.partition */ - struct cgroup_file partition_file; - - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; -}; - -/* - * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously - */ -struct cpuset_remove_tasks_struct { - struct work_struct work; - struct cpuset *cs; + [PERR_ACCESS] = "Enable partition not permitted", }; /* @@ -229,6 +74,12 @@ static cpumask_var_t subpartitions_cpus; */ static cpumask_var_t isolated_cpus; +/* + * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot + */ +static cpumask_var_t boot_hk_cpus; +static bool have_boot_isolcpus; + /* List of remote partition root children */ static struct list_head remote_children; @@ -279,22 +130,6 @@ struct tmpmasks { cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ }; -static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct cpuset, css) : NULL; -} - -/* Retrieve the cpuset for a task */ -static inline struct cpuset *task_cs(struct task_struct *task) -{ - return css_cs(task_css(task, cpuset_cgrp_id)); -} - -static inline struct cpuset *parent_cs(struct cpuset *cs) -{ - return css_cs(cs->css.parent); -} - void inc_dl_tasks_cs(struct task_struct *p) { struct cpuset *cs = task_cs(p); @@ -309,59 +144,6 @@ void dec_dl_tasks_cs(struct task_struct *p) cs->nr_deadline_tasks--; } -/* bits in struct cpuset flags field */ -typedef enum { - CS_ONLINE, - CS_CPU_EXCLUSIVE, - CS_MEM_EXCLUSIVE, - CS_MEM_HARDWALL, - CS_MEMORY_MIGRATE, - CS_SCHED_LOAD_BALANCE, - CS_SPREAD_PAGE, - CS_SPREAD_SLAB, -} cpuset_flagbits_t; - -/* convenient tests for these bits */ -static inline bool is_cpuset_online(struct cpuset *cs) -{ - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); -} - -static inline int is_cpu_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_exclusive(const struct cpuset *cs) -{ - return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); -} - -static inline int is_mem_hardwall(const struct cpuset *cs) -{ - return test_bit(CS_MEM_HARDWALL, &cs->flags); -} - -static inline int is_sched_load_balance(const struct cpuset *cs) -{ - return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); -} - -static inline int is_memory_migrate(const struct cpuset *cs) -{ - return test_bit(CS_MEMORY_MIGRATE, &cs->flags); -} - -static inline int is_spread_page(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_PAGE, &cs->flags); -} - -static inline int is_spread_slab(const struct cpuset *cs) -{ - return test_bit(CS_SPREAD_SLAB, &cs->flags); -} - static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; @@ -403,34 +185,6 @@ static struct cpuset top_cpuset = { .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; -/** - * cpuset_for_each_child - traverse online children of a cpuset - * @child_cs: loop cursor pointing to the current child - * @pos_css: used for iteration - * @parent_cs: target cpuset to walk children of - * - * Walk @child_cs through the online children of @parent_cs. Must be used - * with RCU read locked. - */ -#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ - css_for_each_child((pos_css), &(parent_cs)->css) \ - if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) - -/** - * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants - * @des_cs: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @root_cs: target cpuset to walk ancestor of - * - * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. @root_cs is included in the - * iteration and the first node to be visited. - */ -#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ - css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ - if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) - /* * There are two global locks guarding cpuset structures - cpuset_mutex and * callback_lock. We also require taking task_lock() when dereferencing a @@ -484,6 +238,16 @@ void cpuset_unlock(void) static DEFINE_SPINLOCK(callback_lock); +void cpuset_callback_lock_irq(void) +{ + spin_lock_irq(&callback_lock); +} + +void cpuset_callback_unlock_irq(void) +{ + spin_unlock_irq(&callback_lock); +} + static struct workqueue_struct *cpuset_migrate_mm_wq; static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); @@ -500,6 +264,26 @@ static inline void check_insane_mems_config(nodemask_t *nodes) } /* + * decrease cs->attach_in_progress. + * wake_up cpuset_attach_wq if cs->attach_in_progress==0. + */ +static inline void dec_attach_in_progress_locked(struct cpuset *cs) +{ + lockdep_assert_held(&cpuset_mutex); + + cs->attach_in_progress--; + if (!cs->attach_in_progress) + wake_up(&cpuset_attach_wq); +} + +static inline void dec_attach_in_progress(struct cpuset *cs) +{ + mutex_lock(&cpuset_mutex); + dec_attach_in_progress_locked(cs); + mutex_unlock(&cpuset_mutex); +} + +/* * Cgroup v2 behavior is used on the "cpus" and "mems" control files when * on default hierarchy or when the cpuset_v2_mode flag is set by mounting * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. @@ -596,45 +380,6 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); } -/* - * update task's spread flag if cpuset's page/slab spread flag is set - * - * Call with callback_lock or cpuset_mutex held. The check can be skipped - * if on default hierarchy. - */ -static void cpuset_update_task_spread_flags(struct cpuset *cs, - struct task_struct *tsk) -{ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) - return; - - if (is_spread_page(cs)) - task_set_spread_page(tsk); - else - task_clear_spread_page(tsk); - - if (is_spread_slab(cs)) - task_set_spread_slab(tsk); - else - task_clear_spread_slab(tsk); -} - -/* - * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? - * - * One cpuset is a subset of another if all its allowed CPUs and - * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. - */ - -static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) -{ - return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && - nodes_subset(p->mems_allowed, q->mems_allowed) && - is_cpu_exclusive(p) <= is_cpu_exclusive(q) && - is_mem_exclusive(p) <= is_mem_exclusive(q); -} - /** * alloc_cpumasks - allocate three cpumasks for cpuset * @cs: the cpuset that have cpumasks to be allocated. @@ -750,13 +495,6 @@ static inline bool xcpus_empty(struct cpuset *cs) cpumask_empty(cs->exclusive_cpus); } -static inline struct cpumask *fetch_xcpus(struct cpuset *cs) -{ - return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus : - cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed - : cs->effective_xcpus; -} - /* * cpusets_are_exclusive() - check if two cpusets are exclusive * @@ -764,8 +502,8 @@ static inline struct cpumask *fetch_xcpus(struct cpuset *cs) */ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) { - struct cpumask *xcpus1 = fetch_xcpus(cs1); - struct cpumask *xcpus2 = fetch_xcpus(cs2); + struct cpumask *xcpus1 = user_xcpus(cs1); + struct cpumask *xcpus2 = user_xcpus(cs2); if (cpumask_intersects(xcpus1, xcpus2)) return false; @@ -773,35 +511,6 @@ static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) } /* - * validate_change_legacy() - Validate conditions specific to legacy (v1) - * behavior. - */ -static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) -{ - struct cgroup_subsys_state *css; - struct cpuset *c, *par; - int ret; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* On legacy hierarchy, we must be a subset of our parent cpuset. */ - ret = -EACCES; - par = parent_cs(cur); - if (par && !is_cpuset_subset(trial, par)) - goto out; - - ret = 0; -out: - return ret; -} - -/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -830,7 +539,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) rcu_read_lock(); if (!is_in_v2_mode()) - ret = validate_change_legacy(cur, trial); + ret = cpuset1_validate_change(cur, trial); if (ret) goto out; @@ -996,18 +705,15 @@ static inline int nr_cpusets(void) * were changed (added or removed.) * * Finding the best partition (set of domains): - * The triple nested loops below over i, j, k scan over the - * load balanced cpusets (using the array of cpuset pointers in - * csa[]) looking for pairs of cpusets that have overlapping - * cpus_allowed, but which don't have the same 'pn' partition - * number and gives them in the same partition number. It keeps - * looping on the 'restart' label until it can no longer find - * any such pairs. + * The double nested loops below over i, j scan over the load + * balanced cpusets (using the array of cpuset pointers in csa[]) + * looking for pairs of cpusets that have overlapping cpus_allowed + * and merging them using a union-find algorithm. + * + * The union of the cpus_allowed masks from the set of all cpusets + * having the same root then form the one element of the partition + * (one sched domain) to be passed to partition_sched_domains(). * - * The union of the cpus_allowed masks from the set of - * all cpusets having the same 'pn' value then form the one - * element of the partition (one sched domain) to be passed to - * partition_sched_domains(). */ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) @@ -1015,7 +721,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ - int i, j, k; /* indices for partition finding loops */ + int i, j; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ @@ -1023,6 +729,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct cgroup_subsys_state *pos_css; bool root_load_balance = is_sched_load_balance(&top_cpuset); bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + int nslot_update; doms = NULL; dattr = NULL; @@ -1111,32 +818,28 @@ v2: goto single_root_domain; for (i = 0; i < csn; i++) - csa[i]->pn = i; - ndoms = csn; + uf_node_init(&csa[i]->node); -restart: - /* Find the best partition (set of sched domains) */ + /* Merge overlapping cpusets */ for (i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - int apn = a->pn; - - for (j = 0; j < csn; j++) { - struct cpuset *b = csa[j]; - int bpn = b->pn; - - if (apn != bpn && cpusets_overlap(a, b)) { - for (k = 0; k < csn; k++) { - struct cpuset *c = csa[k]; - - if (c->pn == bpn) - c->pn = apn; - } - ndoms--; /* one less element */ - goto restart; + for (j = i + 1; j < csn; j++) { + if (cpusets_overlap(csa[i], csa[j])) { + /* + * Cgroup v2 shouldn't pass down overlapping + * partition root cpusets. + */ + WARN_ON_ONCE(cgrpv2); + uf_union(&csa[i]->node, &csa[j]->node); } } } + /* Count the total number of domains */ + for (i = 0; i < csn; i++) { + if (uf_find(&csa[i]->node) == &csa[i]->node) + ndoms++; + } + /* * Now we know how many domains to create. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. @@ -1167,44 +870,25 @@ restart: } for (nslot = 0, i = 0; i < csn; i++) { - struct cpuset *a = csa[i]; - struct cpumask *dp; - int apn = a->pn; - - if (apn < 0) { - /* Skip completed partitions */ - continue; - } - - dp = doms[nslot]; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; - } - - cpumask_clear(dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; + nslot_update = 0; for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpumask_or(dp, dp, b->effective_cpus); + if (uf_find(&csa[j]->node) == &csa[i]->node) { + struct cpumask *dp = doms[nslot]; + + if (i == j) { + nslot_update = 1; + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + } + cpumask_or(dp, dp, csa[j]->effective_cpus); cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) - update_domain_attr_tree(dattr + nslot, b); - - /* Done with this partition */ - b->pn = -1; + update_domain_attr_tree(dattr + nslot, csa[j]); } } - nslot++; + if (nslot_update) + nslot++; } BUG_ON(nslot != ndoms); @@ -1296,7 +980,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * * Call with cpuset_mutex held. Takes cpus_read_lock(). */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; @@ -1348,7 +1032,7 @@ static void rebuild_sched_domains_locked(void) partition_and_rebuild_sched_domains(ndoms, doms, attr); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +void rebuild_sched_domains_locked(void) { } #endif /* CONFIG_SMP */ @@ -1368,7 +1052,7 @@ void rebuild_sched_domains(void) } /** - * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @new_cpus: the temp variable for the new effective_cpus mask * @@ -1378,7 +1062,7 @@ void rebuild_sched_domains(void) * is used instead of effective_cpus to make sure all offline CPUs are also * included as hotplug code won't update cpumasks for tasks in top_cpuset. */ -static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) +void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) { struct css_task_iter it; struct task_struct *task; @@ -1428,8 +1112,6 @@ enum partition_cmd { partcmd_invalidate, /* Make partition invalid */ }; -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, - int turning_on); static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct tmpmasks *tmp); @@ -1443,11 +1125,11 @@ static int update_partition_exclusive(struct cpuset *cs, int new_prs) bool exclusive = (new_prs > PRS_MEMBER); if (exclusive && !is_cpu_exclusive(cs)) { - if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) + if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1)) return PERR_NOTEXCL; } else if (!exclusive && is_cpu_exclusive(cs)) { /* Turning off CS_CPU_EXCLUSIVE will not return error */ - update_flag(CS_CPU_EXCLUSIVE, cs, 0); + cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0); } return 0; } @@ -1516,12 +1198,8 @@ static void reset_partition_data(struct cpuset *cs) if (is_cpu_exclusive(cs)) clear_bit(CS_CPU_EXCLUSIVE, &cs->flags); } - if (!cpumask_and(cs->effective_cpus, - parent->effective_cpus, cs->cpus_allowed)) { - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; + if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed)) cpumask_copy(cs->effective_cpus, parent->effective_cpus); - } } /* @@ -1662,7 +1340,7 @@ static inline bool is_local_partition(struct cpuset *cs) * @cs: the cpuset to update * @new_prs: new partition_root_state * @tmp: temparary masks - * Return: 1 if successful, 0 if error + * Return: 0 if successful, errcode if error * * Enable the current cpuset to become a remote partition root taking CPUs * directly from the top cpuset. cpuset_mutex must be held by the caller. @@ -1676,7 +1354,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, * The user must have sysadmin privilege. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return PERR_ACCESS; /* * The requested exclusive_cpus must not be allocated to other @@ -1690,26 +1368,20 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, if (cpumask_empty(tmp->new_cpus) || cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) - return 0; + return PERR_INVCPUS; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); - return 1; + return 0; } /* @@ -1743,7 +1415,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); } @@ -1795,7 +1467,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, /* * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. */ - update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus); update_sibling_cpumasks(&top_cpuset, NULL, tmp); return; @@ -1850,15 +1522,15 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, * @new_cpus: cpu mask * Return: true if there is conflict, false otherwise * - * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in - * an isolated partition. + * CPUs outside of boot_hk_cpus, if defined, can only be used in an + * isolated partition. */ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) { - const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN); - bool all_in_hk = cpumask_subset(new_cpus, hk_domain); + if (!have_boot_isolcpus) + return false; - if (!all_in_hk && (prstate != PRS_ISOLATED)) + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) return true; return false; @@ -2167,7 +1839,7 @@ write_error: update_partition_exclusive(cs, new_prs); if (adding || deleting) { - update_tasks_cpumask(parent, tmp->addmask); + cpuset_update_tasks_cpumask(parent, tmp->addmask); update_sibling_cpumasks(parent, cs, tmp); } @@ -2325,17 +1997,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) cpumask_copy(tmp->new_cpus, parent->effective_cpus); - if (!cp->use_parent_ecpus) { - cp->use_parent_ecpus = true; - parent->child_ecpus_count++; - } - } else if (cp->use_parent_ecpus) { - cp->use_parent_ecpus = false; - WARN_ON_ONCE(!parent->child_ecpus_count); - parent->child_ecpus_count--; - } if (remote) goto get_css; @@ -2359,7 +2022,7 @@ update_parent_effective: /* * update_parent_effective_cpumask() should have been called * for cs already in update_cpumask(). We should also call - * update_tasks_cpumask() again for tasks in the parent + * cpuset_update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ if ((cp != cs) && old_prs) { @@ -2416,7 +2079,7 @@ get_css: WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); - update_tasks_cpumask(cp, cp->effective_cpus); + cpuset_update_tasks_cpumask(cp, cp->effective_cpus); /* * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE @@ -2472,8 +2135,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, * Check all its siblings and call update_cpumasks_hier() * if their effective_cpus will need to be changed. * - * With the addition of effective_xcpus which is a subset of - * cpus_allowed. It is possible a change in parent's effective_cpus + * It is possible a change in parent's effective_cpus * due to a change in a child partition's effective_xcpus will impact * its siblings even if they do not inherit parent's effective_cpus * directly. @@ -2487,8 +2149,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, cpuset_for_each_child(sibling, pos_css, parent) { if (sibling == cs) continue; - if (!sibling->use_parent_ecpus && - !is_partition_valid(sibling)) { + if (!is_partition_valid(sibling)) { compute_effective_cpumask(tmp->new_cpus, sibling, parent); if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus)) @@ -2598,7 +2259,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, invalidate = true; rcu_read_lock(); cpuset_for_each_child(cp, css, parent) { - struct cpumask *xcpus = fetch_xcpus(trialcs); + struct cpumask *xcpus = user_xcpus(trialcs); if (is_partition_valid(cp) && cpumask_intersects(xcpus, cp->effective_xcpus)) { @@ -2845,14 +2506,14 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void *cpuset_being_rebound; /** - * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * cpuset_update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the * effective cpuset's. As this function is called with cpuset_mutex held, * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs) +void cpuset_update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct css_task_iter it; @@ -2950,7 +2611,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); - update_tasks_nodemask(cp); + cpuset_update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -3036,44 +2697,8 @@ bool current_cpuset_is_being_rebound(void) return ret; } -static int update_relax_domain_level(struct cpuset *cs, s64 val) -{ -#ifdef CONFIG_SMP - if (val < -1 || val > sched_domain_level_max + 1) - return -EINVAL; -#endif - - if (val != cs->relax_domain_level) { - cs->relax_domain_level = val; - if (!cpumask_empty(cs->cpus_allowed) && - is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); - } - - return 0; -} - -/** - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed - * - * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays - * stable. - */ -static void update_tasks_flags(struct cpuset *cs) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flags(cs, task); - css_task_iter_end(&it); -} - /* - * update_flag - read a 0 or a 1 in a file and update associated flag + * cpuset_update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (see cpuset_flagbits_t) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared @@ -3081,7 +2706,7 @@ static void update_tasks_flags(struct cpuset *cs) * Call with cpuset_mutex held. */ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, +int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on) { struct cpuset *trialcs; @@ -3117,7 +2742,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, rebuild_sched_domains_locked(); if (spread_flag_changed) - update_tasks_flags(cs); + cpuset1_update_tasks_flags(cs); out: free_cpuset(trialcs); return err; @@ -3166,9 +2791,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; if (!old_prs) { - enum partition_cmd cmd = (new_prs == PRS_ROOT) - ? partcmd_enable : partcmd_enablei; - /* * cpus_allowed and exclusive_cpus cannot be both empty. */ @@ -3177,13 +2799,18 @@ static int update_prstate(struct cpuset *cs, int new_prs) goto out; } - err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); /* - * If an attempt to become local partition root fails, - * try to become a remote partition root instead. + * If parent is valid partition, enable local partiion. + * Otherwise, enable a remote partition. */ - if (err && remote_partition_enable(cs, new_prs, &tmpmask)) - err = 0; + if (is_partition_valid(parent)) { + enum partition_cmd cmd = (new_prs == PRS_ROOT) + ? partcmd_enable : partcmd_enablei; + + err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask); + } else { + err = remote_partition_enable(cs, new_prs, &tmpmask); + } } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. @@ -3236,107 +2863,6 @@ out: return 0; } -/* - * Frequency meter - How fast is some event occurring? - * - * These routines manage a digitally filtered, constant time based, - * event frequency meter. There are four routines: - * fmeter_init() - initialize a frequency meter. - * fmeter_markevent() - called each time the event happens. - * fmeter_getrate() - returns the recent rate of such events. - * fmeter_update() - internal routine used to update fmeter. - * - * A common data structure is passed to each of these routines, - * which is used to keep track of the state required to manage the - * frequency meter and its digital filter. - * - * The filter works on the number of events marked per unit time. - * The filter is single-pole low-pass recursive (IIR). The time unit - * is 1 second. Arithmetic is done using 32-bit integers scaled to - * simulate 3 decimal digits of precision (multiplied by 1000). - * - * With an FM_COEF of 933, and a time base of 1 second, the filter - * has a half-life of 10 seconds, meaning that if the events quit - * happening, then the rate returned from the fmeter_getrate() - * will be cut in half each 10 seconds, until it converges to zero. - * - * It is not worth doing a real infinitely recursive filter. If more - * than FM_MAXTICKS ticks have elapsed since the last filter event, - * just compute FM_MAXTICKS ticks worth, by which point the level - * will be stable. - * - * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid - * arithmetic overflow in the fmeter_update() routine. - * - * Given the simple 32 bit integer arithmetic used, this meter works - * best for reporting rates between one per millisecond (msec) and - * one per 32 (approx) seconds. At constant rates faster than one - * per msec it maxes out at values just under 1,000,000. At constant - * rates between one per msec, and one per second it will stabilize - * to a value N*1000, where N is the rate of events per second. - * At constant rates between one per second and one per 32 seconds, - * it will be choppy, moving up on the seconds that have an event, - * and then decaying until the next event. At rates slower than - * about one in 32 seconds, it decays all the way back to zero between - * each event. - */ - -#define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ -#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ -#define FM_SCALE 1000 /* faux fixed point scale */ - -/* Initialize a frequency meter */ -static void fmeter_init(struct fmeter *fmp) -{ - fmp->cnt = 0; - fmp->val = 0; - fmp->time = 0; - spin_lock_init(&fmp->lock); -} - -/* Internal meter update - process cnt events and update value */ -static void fmeter_update(struct fmeter *fmp) -{ - time64_t now; - u32 ticks; - - now = ktime_get_seconds(); - ticks = now - fmp->time; - - if (ticks == 0) - return; - - ticks = min(FM_MAXTICKS, ticks); - while (ticks-- > 0) - fmp->val = (FM_COEF * fmp->val) / FM_SCALE; - fmp->time = now; - - fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; - fmp->cnt = 0; -} - -/* Process any previous ticks, then bump cnt by one (times scale). */ -static void fmeter_markevent(struct fmeter *fmp) -{ - spin_lock(&fmp->lock); - fmeter_update(fmp); - fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); - spin_unlock(&fmp->lock); -} - -/* Process any previous ticks, then return current value. */ -static int fmeter_getrate(struct fmeter *fmp) -{ - int val; - - spin_lock(&fmp->lock); - fmeter_update(fmp); - val = fmp->val; - spin_unlock(&fmp->lock); - return val; -} - static struct cpuset *cpuset_attach_old_cs; /* @@ -3445,9 +2971,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) cs = css_cs(css); mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); + dec_attach_in_progress_locked(cs); if (cs->nr_migrate_dl_tasks) { int cpu = cpumask_any(cs->effective_cpus); @@ -3483,7 +3007,7 @@ static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flags(cs, task); + cpuset1_update_task_spread_flags(cs, task); } static void cpuset_attach(struct cgroup_taskset *tset) @@ -3562,116 +3086,15 @@ out: reset_migrate_dl_data(cs); } - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - - mutex_unlock(&cpuset_mutex); -} - -/* The various types of files and directories in a cpuset file system */ - -typedef enum { - FILE_MEMORY_MIGRATE, - FILE_CPULIST, - FILE_MEMLIST, - FILE_EFFECTIVE_CPULIST, - FILE_EFFECTIVE_MEMLIST, - FILE_SUBPARTS_CPULIST, - FILE_EXCLUSIVE_CPULIST, - FILE_EFFECTIVE_XCPULIST, - FILE_ISOLATED_CPULIST, - FILE_CPU_EXCLUSIVE, - FILE_MEM_EXCLUSIVE, - FILE_MEM_HARDWALL, - FILE_SCHED_LOAD_BALANCE, - FILE_PARTITION_ROOT, - FILE_SCHED_RELAX_DOMAIN_LEVEL, - FILE_MEMORY_PRESSURE_ENABLED, - FILE_MEMORY_PRESSURE, - FILE_SPREAD_PAGE, - FILE_SPREAD_SLAB, -} cpuset_filetype_t; - -static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = 0; + dec_attach_in_progress_locked(cs); - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) { - retval = -ENODEV; - goto out_unlock; - } - - switch (type) { - case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); - break; - case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); - break; - case FILE_MEM_HARDWALL: - retval = update_flag(CS_MEM_HARDWALL, cs, val); - break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); - break; - case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, val); - break; - case FILE_MEMORY_PRESSURE_ENABLED: - cpuset_memory_pressure_enabled = !!val; - break; - case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, val); - break; - case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; -} - -static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, - s64 val) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - int retval = -ENODEV; - - cpus_read_lock(); - mutex_lock(&cpuset_mutex); - if (!is_cpuset_online(cs)) - goto out_unlock; - - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, val); - break; - default: - retval = -EINVAL; - break; - } -out_unlock: - mutex_unlock(&cpuset_mutex); - cpus_read_unlock(); - return retval; } /* * Common handling for a write to a "cpus" or "mems" file. */ -static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, +ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cpuset *cs = css_cs(of_css(of)); @@ -3746,7 +3169,7 @@ out_unlock: * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. */ -static int cpuset_common_seq_show(struct seq_file *sf, void *v) +int cpuset_common_seq_show(struct seq_file *sf, void *v) { struct cpuset *cs = css_cs(seq_css(sf)); cpuset_filetype_t type = seq_cft(sf)->private; @@ -3787,52 +3210,6 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) return ret; } -static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_CPU_EXCLUSIVE: - return is_cpu_exclusive(cs); - case FILE_MEM_EXCLUSIVE: - return is_mem_exclusive(cs); - case FILE_MEM_HARDWALL: - return is_mem_hardwall(cs); - case FILE_SCHED_LOAD_BALANCE: - return is_sched_load_balance(cs); - case FILE_MEMORY_MIGRATE: - return is_memory_migrate(cs); - case FILE_MEMORY_PRESSURE_ENABLED: - return cpuset_memory_pressure_enabled; - case FILE_MEMORY_PRESSURE: - return fmeter_getrate(&cs->fmeter); - case FILE_SPREAD_PAGE: - return is_spread_page(cs); - case FILE_SPREAD_SLAB: - return is_spread_slab(cs); - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - -static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - switch (type) { - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - return cs->relax_domain_level; - default: - BUG(); - } - - /* Unreachable but makes gcc happy */ - return 0; -} - static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); @@ -3897,113 +3274,6 @@ out_unlock: } /* - * for the common functions, 'private' gives the type of file - */ - -static struct cftype legacy_files[] = { - { - .name = "cpus", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * NR_CPUS), - .private = FILE_CPULIST, - }, - - { - .name = "mems", - .seq_show = cpuset_common_seq_show, - .write = cpuset_write_resmask, - .max_write_len = (100U + 6 * MAX_NUMNODES), - .private = FILE_MEMLIST, - }, - - { - .name = "effective_cpus", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_CPULIST, - }, - - { - .name = "effective_mems", - .seq_show = cpuset_common_seq_show, - .private = FILE_EFFECTIVE_MEMLIST, - }, - - { - .name = "cpu_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_CPU_EXCLUSIVE, - }, - - { - .name = "mem_exclusive", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_EXCLUSIVE, - }, - - { - .name = "mem_hardwall", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEM_HARDWALL, - }, - - { - .name = "sched_load_balance", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SCHED_LOAD_BALANCE, - }, - - { - .name = "sched_relax_domain_level", - .read_s64 = cpuset_read_s64, - .write_s64 = cpuset_write_s64, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, - }, - - { - .name = "memory_migrate", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_MIGRATE, - }, - - { - .name = "memory_pressure", - .read_u64 = cpuset_read_u64, - .private = FILE_MEMORY_PRESSURE, - }, - - { - .name = "memory_spread_page", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_PAGE, - }, - - { - /* obsolete, may be removed in the future */ - .name = "memory_spread_slab", - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_SPREAD_SLAB, - }, - - { - .name = "memory_pressure_enabled", - .flags = CFTYPE_ONLY_ON_ROOT, - .read_u64 = cpuset_read_u64, - .write_u64 = cpuset_write_u64, - .private = FILE_MEMORY_PRESSURE_ENABLED, - }, - - { } /* terminate */ -}; - -/* * This is currently a minimal set for the default hierarchy. It can be * expanded later on by migrating more features and control files from v1. */ @@ -4150,8 +3420,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; - cs->use_parent_ecpus = true; - parent->child_ecpus_count++; } spin_unlock_irq(&callback_lock); @@ -4215,14 +3483,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - - if (cs->use_parent_ecpus) { - struct cpuset *parent = parent_cs(cs); - - cs->use_parent_ecpus = false; - parent->child_ecpus_count--; - } + cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); @@ -4312,11 +3573,7 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) if (same_cs) return; - mutex_lock(&cpuset_mutex); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - mutex_unlock(&cpuset_mutex); + dec_attach_in_progress(cs); } /* @@ -4348,10 +3605,7 @@ static void cpuset_fork(struct task_struct *task) guarantee_online_mems(cs, &cpuset_attach_nodemask_to); cpuset_attach_task(cs, task); - cs->attach_in_progress--; - if (!cs->attach_in_progress) - wake_up(&cpuset_attach_wq); - + dec_attach_in_progress_locked(cs); mutex_unlock(&cpuset_mutex); } @@ -4368,7 +3622,9 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .can_fork = cpuset_can_fork, .cancel_fork = cpuset_cancel_fork, .fork = cpuset_fork, - .legacy_cftypes = legacy_files, +#ifdef CONFIG_CPUSETS_V1 + .legacy_cftypes = cpuset1_files, +#endif .dfl_cftypes = dfl_files, .early_init = true, .threaded = true, @@ -4401,91 +3657,14 @@ int __init cpuset_init(void) BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); - return 0; -} - -/* - * If CPU and/or memory hotplug handlers, below, unplug any CPUs - * or memory nodes, we need to walk over the cpuset hierarchy, - * removing that CPU or node from all cpusets. If this removes the - * last CPU or node from a cpuset, then move the tasks in the empty - * cpuset to its next-highest non-empty parent. - */ -static void remove_tasks_in_empty_cpuset(struct cpuset *cs) -{ - struct cpuset *parent; - - /* - * Find its next-highest non-empty parent, (top cpuset - * has online cpus, so can't be empty). - */ - parent = parent_cs(cs); - while (cpumask_empty(parent->cpus_allowed) || - nodes_empty(parent->mems_allowed)) - parent = parent_cs(parent); - - if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - pr_err("cpuset: failed to transfer tasks out of empty cpuset "); - pr_cont_cgroup_name(cs->css.cgroup); - pr_cont("\n"); + have_boot_isolcpus = housekeeping_enabled(HK_TYPE_DOMAIN); + if (have_boot_isolcpus) { + BUG_ON(!alloc_cpumask_var(&boot_hk_cpus, GFP_KERNEL)); + cpumask_copy(boot_hk_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(isolated_cpus, cpu_possible_mask, boot_hk_cpus); } -} - -static void cpuset_migrate_tasks_workfn(struct work_struct *work) -{ - struct cpuset_remove_tasks_struct *s; - - s = container_of(work, struct cpuset_remove_tasks_struct, work); - remove_tasks_in_empty_cpuset(s->cs); - css_put(&s->cs->css); - kfree(s); -} - -static void -hotplug_update_tasks_legacy(struct cpuset *cs, - struct cpumask *new_cpus, nodemask_t *new_mems, - bool cpus_updated, bool mems_updated) -{ - bool is_empty; - - spin_lock_irq(&callback_lock); - cpumask_copy(cs->cpus_allowed, new_cpus); - cpumask_copy(cs->effective_cpus, new_cpus); - cs->mems_allowed = *new_mems; - cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); - - /* - * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migrated to an ancestor. - */ - if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) - update_tasks_cpumask(cs, new_cpus); - if (mems_updated && !nodes_empty(cs->mems_allowed)) - update_tasks_nodemask(cs); - - is_empty = cpumask_empty(cs->cpus_allowed) || - nodes_empty(cs->mems_allowed); - - /* - * Move tasks to the nearest ancestor with execution resources, - * This is full cgroup operation which will also call back into - * cpuset. Execute it asynchronously using workqueue. - */ - if (is_empty && cs->css.cgroup->nr_populated_csets && - css_tryget_online(&cs->css)) { - struct cpuset_remove_tasks_struct *s; - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (WARN_ON_ONCE(!s)) { - css_put(&cs->css); - return; - } - - s->cs = cs; - INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); - schedule_work(&s->work); - } + return 0; } static void @@ -4505,9 +3684,9 @@ hotplug_update_tasks(struct cpuset *cs, spin_unlock_irq(&callback_lock); if (cpus_updated) - update_tasks_cpumask(cs, new_cpus); + cpuset_update_tasks_cpumask(cs, new_cpus); if (mems_updated) - update_tasks_nodemask(cs); + cpuset_update_tasks_nodemask(cs); } void cpuset_force_rebuild(void) @@ -4608,7 +3787,7 @@ update_tasks: hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else - hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, + cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); unlock: @@ -4693,7 +3872,7 @@ static void cpuset_handle_hotplug(void) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; spin_unlock_irq(&callback_lock); - update_tasks_nodemask(&top_cpuset); + cpuset_update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -5033,19 +4212,6 @@ int cpuset_mem_spread_node(void) } /** - * cpuset_slab_spread_node() - On which node to begin search for a slab page - */ -int cpuset_slab_spread_node(void) -{ - if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) - current->cpuset_slab_spread_rotor = - node_random(¤t->mems_allowed); - - return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); -} -EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); - -/** * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? * @tsk1: pointer to task_struct of some task. * @tsk2: pointer to task_struct of some other task. @@ -5083,39 +4249,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -/* - * Collection of memory_pressure is suppressed unless - * this flag is enabled by writing "1" to the special - * cpuset file 'memory_pressure_enabled' in the root cpuset. - */ - -int cpuset_memory_pressure_enabled __read_mostly; - -/* - * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. - * - * Keep a running average of the rate of synchronous (direct) - * page reclaim efforts initiated by tasks in each cpuset. - * - * This represents the rate at which some task in the cpuset - * ran low on memory on all nodes it was allowed to use, and - * had to enter the kernels page reclaim code in an effort to - * create more free memory by tossing clean pages or swapping - * or writing dirty pages. - * - * Display to user space in the per-cpuset read-only file - * "memory_pressure". Value displayed is an integer - * representing the recent rate of entry into the synchronous - * (direct) page reclaim by any task attached to the cpuset. - */ - -void __cpuset_memory_pressure_bump(void) -{ - rcu_read_lock(); - fmeter_markevent(&task_cs(current)->fmeter); - rcu_read_unlock(); -} - #ifdef CONFIG_PROC_PID_CPUSET /* * proc_cpuset_show() diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index f5cb0ec45b9d..8f61114c36dd 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -244,7 +244,6 @@ static void pids_event(struct pids_cgroup *pids_forking, struct pids_cgroup *pids_over_limit) { struct pids_cgroup *p = pids_forking; - bool limit = false; /* Only log the first time limit is hit. */ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) { @@ -252,20 +251,17 @@ static void pids_event(struct pids_cgroup *pids_forking, pr_cont_cgroup_path(p->css.cgroup); pr_cont("\n"); } - cgroup_file_notify(&p->events_local_file); if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || - cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) { + cgroup_file_notify(&p->events_local_file); return; + } - for (; parent_pids(p); p = parent_pids(p)) { - if (p == pids_over_limit) { - limit = true; - atomic64_inc(&p->events_local[PIDCG_MAX]); - cgroup_file_notify(&p->events_local_file); - } - if (limit) - atomic64_inc(&p->events[PIDCG_MAX]); + atomic64_inc(&pids_over_limit->events_local[PIDCG_MAX]); + cgroup_file_notify(&pids_over_limit->events_local_file); + for (p = pids_over_limit; parent_pids(p); p = parent_pids(p)) { + atomic64_inc(&p->events[PIDCG_MAX]); cgroup_file_notify(&p->events_file); } } @@ -276,15 +272,10 @@ static void pids_event(struct pids_cgroup *pids_forking, */ static int pids_can_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; struct pids_cgroup *pids, *pids_over_limit; int err; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); + pids = css_pids(cset->subsys[pids_cgrp_id]); err = pids_try_charge(pids, 1, &pids_over_limit); if (err) pids_event(pids, pids_over_limit); @@ -294,14 +285,9 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset) static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) { - struct cgroup_subsys_state *css; struct pids_cgroup *pids; - if (cset) - css = cset->subsys[pids_cgrp_id]; - else - css = task_css_check(current, pids_cgrp_id, true); - pids = css_pids(css); + pids = css_pids(cset->subsys[pids_cgrp_id]); pids_uncharge(pids, 1); } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 24b1e1143260..938c48952d26 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -28,34 +28,34 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #ifdef CONFIG_CONTEXT_TRACKING_IDLE - .dynticks_nesting = 1, - .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, + .nesting = 1, + .nmi_nesting = CT_NESTING_IRQ_NONIDLE, #endif - .state = ATOMIC_INIT(RCU_DYNTICKS_IDX), + .state = ATOMIC_INIT(CT_RCU_WATCHING), }; EXPORT_SYMBOL_GPL(context_tracking); #ifdef CONFIG_CONTEXT_TRACKING_IDLE #define TPS(x) tracepoint_string(x) -/* Record the current task on dyntick-idle entry. */ -static __always_inline void rcu_dynticks_task_enter(void) +/* Record the current task on exiting RCU-tasks (dyntick-idle entry). */ +static __always_inline void rcu_task_exit(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Record no current task on dyntick-idle exit. */ -static __always_inline void rcu_dynticks_task_exit(void) +/* Record no current task on entering RCU-tasks (dyntick-idle exit). */ +static __always_inline void rcu_task_enter(void) { #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ } -/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ -static __always_inline void rcu_dynticks_task_trace_enter(void) +/* Turn on heavyweight RCU tasks trace readers on kernel exit. */ +static __always_inline void rcu_task_trace_heavyweight_enter(void) { #ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -63,8 +63,8 @@ static __always_inline void rcu_dynticks_task_trace_enter(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ -static __always_inline void rcu_dynticks_task_trace_exit(void) +/* Turn off heavyweight RCU tasks trace readers on kernel entry. */ +static __always_inline void rcu_task_trace_heavyweight_exit(void) { #ifdef CONFIG_TASKS_TRACE_RCU if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) @@ -87,10 +87,10 @@ static noinstr void ct_kernel_exit_state(int offset) * critical sections, and we also must force ordering with the * next idle sojourn. */ - rcu_dynticks_task_trace_enter(); // Before ->dynticks update! + rcu_task_trace_heavyweight_enter(); // Before CT state update! seq = ct_state_inc(offset); // RCU is no longer watching. Better be in extended quiescent state! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING)); } /* @@ -109,15 +109,15 @@ static noinstr void ct_kernel_enter_state(int offset) */ seq = ct_state_inc(offset); // RCU is now watching. Better not be in an extended quiescent state! - rcu_dynticks_task_trace_exit(); // After ->dynticks update! - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); + rcu_task_trace_heavyweight_exit(); // After CT state update! + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING)); } /* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * - * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * We crowbar the ->nmi_nesting field to zero to allow for * the possibility of usermode upcalls having messed up our count * of interrupt nesting level during the prior busy period. */ @@ -125,19 +125,19 @@ static void noinstr ct_kernel_exit(bool user, int offset) { struct context_tracking *ct = this_cpu_ptr(&context_tracking); - WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(ct_nmi_nesting() != CT_NESTING_IRQ_NONIDLE); + WRITE_ONCE(ct->nmi_nesting, 0); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - ct_dynticks_nesting() == 0); - if (ct_dynticks_nesting() != 1) { + ct_nesting() == 0); + if (ct_nesting() != 1) { // RCU will still be watching, so just do accounting and leave. - ct->dynticks_nesting--; + ct->nesting--; return; } instrumentation_begin(); lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), ct_dynticks_nesting(), 0, ct_dynticks()); + trace_rcu_watching(TPS("End"), ct_nesting(), 0, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rcu_preempt_deferred_qs(current); @@ -145,18 +145,18 @@ static void noinstr ct_kernel_exit(bool user, int offset) instrument_atomic_write(&ct->state, sizeof(ct->state)); instrumentation_end(); - WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + WRITE_ONCE(ct->nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... ct_kernel_exit_state(offset); // ... but is no longer watching here. - rcu_dynticks_task_enter(); + rcu_task_exit(); } /* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * - * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * We crowbar the ->nmi_nesting field to CT_NESTING_IRQ_NONIDLE to * allow for the possibility of usermode upcalls messing up our count of * interrupt nesting level during the busy period that is just now starting. */ @@ -166,14 +166,14 @@ static void noinstr ct_kernel_enter(bool user, int offset) long oldval; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - oldval = ct_dynticks_nesting(); + oldval = ct_nesting(); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); if (oldval) { // RCU was already watching, so just do accounting and leave. - ct->dynticks_nesting++; + ct->nesting++; return; } - rcu_dynticks_task_exit(); + rcu_task_enter(); // RCU is not watching here ... ct_kernel_enter_state(offset); // ... but is watching here. @@ -182,11 +182,11 @@ static void noinstr ct_kernel_enter(bool user, int offset) // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); - trace_rcu_dyntick(TPS("End"), ct_dynticks_nesting(), 1, ct_dynticks()); + trace_rcu_watching(TPS("Start"), ct_nesting(), 1, ct_rcu_watching()); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - WRITE_ONCE(ct->dynticks_nesting, 1); - WARN_ON_ONCE(ct_dynticks_nmi_nesting()); - WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); + WRITE_ONCE(ct->nesting, 1); + WARN_ON_ONCE(ct_nmi_nesting()); + WRITE_ONCE(ct->nmi_nesting, CT_NESTING_IRQ_NONIDLE); instrumentation_end(); } @@ -194,7 +194,7 @@ static void noinstr ct_kernel_enter(bool user, int offset) * ct_nmi_exit - inform RCU of exit from NMI context * * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting + * RCU-idle period, update ct->state and ct->nmi_nesting * to let the RCU grace-period handling know that the CPU is back to * being RCU-idle. * @@ -207,47 +207,47 @@ void noinstr ct_nmi_exit(void) instrumentation_begin(); /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * Check for ->nmi_nesting underflow and bad CT state. * (We are exiting an NMI handler, so RCU better be paying attention * to us!) */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + WARN_ON_ONCE(ct_nmi_nesting() <= 0); + WARN_ON_ONCE(!rcu_is_watching_curr_cpu()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so * leave it in non-RCU-idle state. */ - if (ct_dynticks_nmi_nesting() != 1) { - trace_rcu_dyntick(TPS("--="), ct_dynticks_nmi_nesting(), ct_dynticks_nmi_nesting() - 2, - ct_dynticks()); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ - ct_dynticks_nmi_nesting() - 2); + if (ct_nmi_nesting() != 1) { + trace_rcu_watching(TPS("--="), ct_nmi_nesting(), ct_nmi_nesting() - 2, + ct_rcu_watching()); + WRITE_ONCE(ct->nmi_nesting, /* No store tearing. */ + ct_nmi_nesting() - 2); instrumentation_end(); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), ct_dynticks_nmi_nesting(), 0, ct_dynticks()); - WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + trace_rcu_watching(TPS("Endirq"), ct_nmi_nesting(), 0, ct_rcu_watching()); + WRITE_ONCE(ct->nmi_nesting, 0); /* Avoid store tearing. */ // instrumentation for the noinstr ct_kernel_exit_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); instrumentation_end(); // RCU is watching here ... - ct_kernel_exit_state(RCU_DYNTICKS_IDX); + ct_kernel_exit_state(CT_RCU_WATCHING); // ... but is no longer watching here. if (!in_nmi()) - rcu_dynticks_task_enter(); + rcu_task_exit(); } /** * ct_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update ct->state and - * ct->dynticks_nmi_nesting to let the RCU grace-period handling know + * ct->nmi_nesting to let the RCU grace-period handling know * that the CPU is active. This implementation permits nested NMIs, as * long as the nesting level does not overflow an int. (You will probably * run out of stack space first.) @@ -261,27 +261,27 @@ void noinstr ct_nmi_enter(void) struct context_tracking *ct = this_cpu_ptr(&context_tracking); /* Complain about underflow. */ - WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); + WARN_ON_ONCE(ct_nmi_nesting() < 0); /* - * If idle from RCU viewpoint, atomically increment ->dynticks - * to mark non-idle and increment ->dynticks_nmi_nesting by one. - * Otherwise, increment ->dynticks_nmi_nesting by two. This means - * if ->dynticks_nmi_nesting is equal to one, we are guaranteed + * If idle from RCU viewpoint, atomically increment CT state + * to mark non-idle and increment ->nmi_nesting by one. + * Otherwise, increment ->nmi_nesting by two. This means + * if ->nmi_nesting is equal to one, we are guaranteed * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (rcu_dynticks_curr_cpu_in_eqs()) { + if (!rcu_is_watching_curr_cpu()) { if (!in_nmi()) - rcu_dynticks_task_exit(); + rcu_task_enter(); // RCU is not watching here ... - ct_kernel_enter_state(RCU_DYNTICKS_IDX); + ct_kernel_enter_state(CT_RCU_WATCHING); // ... but is watching here. instrumentation_begin(); - // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + // instrumentation for the noinstr rcu_is_watching_curr_cpu() instrument_atomic_read(&ct->state, sizeof(ct->state)); // instrumentation for the noinstr ct_kernel_enter_state() instrument_atomic_write(&ct->state, sizeof(ct->state)); @@ -294,12 +294,12 @@ void noinstr ct_nmi_enter(void) instrumentation_begin(); } - trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), - ct_dynticks_nmi_nesting(), - ct_dynticks_nmi_nesting() + incby, ct_dynticks()); + trace_rcu_watching(incby == 1 ? TPS("Startirq") : TPS("++="), + ct_nmi_nesting(), + ct_nmi_nesting() + incby, ct_rcu_watching()); instrumentation_end(); - WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ - ct_dynticks_nmi_nesting() + incby); + WRITE_ONCE(ct->nmi_nesting, /* Prevent store tearing. */ + ct_nmi_nesting() + incby); barrier(); } @@ -317,7 +317,7 @@ void noinstr ct_nmi_enter(void) void noinstr ct_idle_enter(void) { WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); - ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE); + ct_kernel_exit(false, CT_RCU_WATCHING + CT_STATE_IDLE); } EXPORT_SYMBOL_GPL(ct_idle_enter); @@ -335,7 +335,7 @@ void noinstr ct_idle_exit(void) unsigned long flags; raw_local_irq_save(flags); - ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE); + ct_kernel_enter(false, CT_RCU_WATCHING - CT_STATE_IDLE); raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ct_idle_exit); @@ -485,7 +485,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * user_exit() or ct_irq_enter(). Let's remove RCU's dependency * on the tick. */ - if (state == CONTEXT_USER) { + if (state == CT_STATE_USER) { instrumentation_begin(); trace_user_enter(0); vtime_user_enter(current); @@ -504,7 +504,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * CPU doesn't need to maintain the tick for RCU maintenance purposes * when the CPU runs in userspace. */ - ct_kernel_exit(true, RCU_DYNTICKS_IDX + state); + ct_kernel_exit(true, CT_RCU_WATCHING + state); /* * Special case if we only track user <-> kernel transitions for tickless @@ -534,7 +534,7 @@ void noinstr __ct_user_enter(enum ctx_state state) /* * Tracking for vtime and RCU EQS. Make sure we don't race * with NMIs. OTOH we don't care about ordering here since - * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * RCU only requires CT_RCU_WATCHING increments to be fully * ordered. */ raw_atomic_add(state, &ct->state); @@ -620,8 +620,8 @@ void noinstr __ct_user_exit(enum ctx_state state) * Exit RCU idle mode while entering the kernel because it can * run a RCU read side critical section anytime. */ - ct_kernel_enter(true, RCU_DYNTICKS_IDX - state); - if (state == CONTEXT_USER) { + ct_kernel_enter(true, CT_RCU_WATCHING - state); + if (state == CT_STATE_USER) { instrumentation_begin(); vtime_user_exit(current); trace_user_exit(0); @@ -634,17 +634,17 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - raw_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CT_STATE_KERNEL); } else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - raw_atomic_set(&ct->state, CONTEXT_KERNEL); + raw_atomic_set(&ct->state, CT_STATE_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race * with NMIs. OTOH we don't care about ordering here since - * RCU only requires RCU_DYNTICKS_IDX increments to be fully + * RCU only requires CT_RCU_WATCHING increments to be fully * ordered. */ raw_atomic_sub(state, &ct->state); diff --git a/kernel/cpu.c b/kernel/cpu.c index b1fd2a3db91a..d293d52a3e00 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -330,7 +330,7 @@ static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state st /* Poll for one millisecond */ arch_cpuhp_sync_state_poll(); } else { - usleep_range_state(USEC_PER_MSEC, 2 * USEC_PER_MSEC, TASK_UNINTERRUPTIBLE); + usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC); } sync = atomic_read(st); } @@ -1808,6 +1808,7 @@ static int __init parallel_bringup_parse_param(char *arg) } early_param("cpuhp.parallel", parallel_bringup_parse_param); +#ifdef CONFIG_HOTPLUG_SMT static inline bool cpuhp_smt_aware(void) { return cpu_smt_max_threads > 1; @@ -1817,6 +1818,21 @@ static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) { return cpu_primary_thread_mask; } +#else +static inline bool cpuhp_smt_aware(void) +{ + return false; +} +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_none_mask; +} +#endif + +bool __weak arch_cpuhp_init_parallel_bringup(void) +{ + return true; +} /* * On architectures which have enabled parallel bringup this invokes all BP @@ -2689,9 +2705,7 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -/** - * Check if the core a CPU belongs to is online - */ +/* Check if the core a CPU belongs to is online */ #if !defined(topology_is_core_online) static inline bool topology_is_core_online(unsigned int cpu) { diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c06e56be0ca1..4c0dcd909121 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -8,8 +8,7 @@ config HAS_DMA depends on !NO_DMA default y -config DMA_OPS - depends on HAS_DMA +config DMA_OPS_HELPERS bool # @@ -109,8 +108,8 @@ config DMA_BOUNCE_UNALIGNED_KMALLOC config DMA_NEED_SYNC def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \ - ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || DMA_OPS || \ - SWIOTLB + ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || \ + ARCH_HAS_DMA_OPS || SWIOTLB config DMA_RESTRICTED_POOL bool "DMA Restricted Pool" diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 21926e46ef4f..6977033444a3 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_HAS_DMA) += mapping.o direct.o -obj-$(CONFIG_DMA_OPS) += ops_helpers.o -obj-$(CONFIG_DMA_OPS) += dummy.o +obj-$(CONFIG_DMA_OPS_HELPERS) += ops_helpers.o +obj-$(CONFIG_ARCH_HAS_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 4480a3cd92e0..5b4e6d3bf7bc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -20,7 +20,7 @@ * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ -unsigned int zone_dma_bits __ro_after_init = 24; +u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) @@ -59,7 +59,7 @@ static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) * zones. */ *phys_limit = dma_to_phys(dev, dma_limit); - if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) + if (*phys_limit <= zone_dma_limit) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; @@ -140,7 +140,7 @@ again: if (!page) page = alloc_pages_node(node, gfp, get_order(size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, size); + __free_pages(page, get_order(size)); page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && @@ -580,7 +580,7 @@ int dma_direct_supported(struct device *dev, u64 mask) * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); + min_mask = min_t(u64, min_mask, zone_dma_limit); return mask >= phys_to_dma_unencrypted(dev, min_mask); } diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c index b492d59ac77e..92de80e5b057 100644 --- a/kernel/dma/dummy.c +++ b/kernel/dma/dummy.c @@ -17,6 +17,15 @@ static dma_addr_t dma_dummy_map_page(struct device *dev, struct page *page, { return DMA_MAPPING_ERROR; } +static void dma_dummy_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + /* + * Dummy ops doesn't support map_page, so unmap_page should never be + * called. + */ + WARN_ON_ONCE(true); +} static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir, @@ -25,6 +34,16 @@ static int dma_dummy_map_sg(struct device *dev, struct scatterlist *sgl, return -EINVAL; } +static void dma_dummy_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + unsigned long attrs) +{ + /* + * Dummy ops doesn't support map_sg, so unmap_sg should never be called. + */ + WARN_ON_ONCE(true); +} + static int dma_dummy_supported(struct device *hwdev, u64 mask) { return 0; @@ -33,6 +52,8 @@ static int dma_dummy_supported(struct device *hwdev, u64 mask) const struct dma_map_ops dma_dummy_ops = { .mmap = dma_dummy_mmap, .map_page = dma_dummy_map_page, + .unmap_page = dma_dummy_unmap_page, .map_sg = dma_dummy_map_sg, + .unmap_sg = dma_dummy_unmap_sg, .dma_supported = dma_dummy_supported, }; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b1c18058d55f..b839683da0ba 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -10,6 +10,7 @@ #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> +#include <linux/iommu-dma.h> #include <linux/kmsan.h> #include <linux/of_device.h> #include <linux/slab.h> @@ -17,6 +18,9 @@ #include "debug.h" #include "direct.h" +#define CREATE_TRACE_POINTS +#include <trace/events/dma.h> + #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) @@ -116,8 +120,12 @@ EXPORT_SYMBOL(dmam_alloc_attrs); static bool dma_go_direct(struct device *dev, dma_addr_t mask, const struct dma_map_ops *ops) { + if (use_dma_iommu(dev)) + return false; + if (likely(!ops)) return true; + #ifdef CONFIG_DMA_OPS_BYPASS if (dev->dma_ops_bypass) return min_not_zero(mask, dev->bus_dma_limit) >= @@ -159,9 +167,13 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, if (dma_map_direct(dev, ops) || arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + else if (use_dma_iommu(dev)) + addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); kmsan_handle_dma(page, offset, size, dir); + trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir, + attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; @@ -177,8 +189,11 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, if (dma_map_direct(dev, ops) || arch_dma_unmap_page_direct(dev, addr + size)) dma_direct_unmap_page(dev, addr, size, dir, attrs); - else if (ops->unmap_page) + else if (use_dma_iommu(dev)) + iommu_dma_unmap_page(dev, addr, size, dir, attrs); + else ops->unmap_page(dev, addr, size, dir, attrs); + trace_dma_unmap_page(dev, addr, size, dir, attrs); debug_dma_unmap_page(dev, addr, size, dir); } EXPORT_SYMBOL(dma_unmap_page_attrs); @@ -197,11 +212,14 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, if (dma_map_direct(dev, ops) || arch_dma_map_sg_direct(dev, sg, nents)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); + else if (use_dma_iommu(dev)) + ents = iommu_dma_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); if (ents > 0) { kmsan_handle_dma_sg(sg, nents, dir); + trace_dma_map_sg(dev, sg, nents, ents, dir, attrs); debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && ents != -EIO && ents != -EREMOTEIO)) { @@ -287,10 +305,13 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); + trace_dma_unmap_sg(dev, sg, nents, dir, attrs); debug_dma_unmap_sg(dev, sg, nents, dir); if (dma_map_direct(dev, ops) || arch_dma_unmap_sg_direct(dev, sg, nents)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); + else if (use_dma_iommu(dev)) + iommu_dma_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); } @@ -309,9 +330,12 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, if (dma_map_direct(dev, ops)) addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); + else if (use_dma_iommu(dev)) + addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs); else if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); + trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs); debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); return addr; } @@ -323,8 +347,13 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (!dma_map_direct(dev, ops) && ops->unmap_resource) + if (dma_map_direct(dev, ops)) + ; /* nothing to do: uncached and no swiotlb */ + else if (use_dma_iommu(dev)) + iommu_dma_unmap_resource(dev, addr, size, dir, attrs); + else if (ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); + trace_dma_unmap_resource(dev, addr, size, dir, attrs); debug_dma_unmap_resource(dev, addr, size, dir); } EXPORT_SYMBOL(dma_unmap_resource); @@ -338,8 +367,11 @@ void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); + else if (use_dma_iommu(dev)) + iommu_dma_sync_single_for_cpu(dev, addr, size, dir); else if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); + trace_dma_sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); } EXPORT_SYMBOL(__dma_sync_single_for_cpu); @@ -352,8 +384,11 @@ void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr, BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_single_for_device(dev, addr, size, dir); + else if (use_dma_iommu(dev)) + iommu_dma_sync_single_for_device(dev, addr, size, dir); else if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); + trace_dma_sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); } EXPORT_SYMBOL(__dma_sync_single_for_device); @@ -366,8 +401,11 @@ void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_cpu(dev, sg, nelems, dir); + else if (use_dma_iommu(dev)) + iommu_dma_sync_sg_for_cpu(dev, sg, nelems, dir); else if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); + trace_dma_sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); } EXPORT_SYMBOL(__dma_sync_sg_for_cpu); @@ -380,8 +418,11 @@ void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops)) dma_direct_sync_sg_for_device(dev, sg, nelems, dir); + else if (use_dma_iommu(dev)) + iommu_dma_sync_sg_for_device(dev, sg, nelems, dir); else if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); + trace_dma_sync_sg_for_device(dev, sg, nelems, dir); debug_dma_sync_sg_for_device(dev, sg, nelems, dir); } EXPORT_SYMBOL(__dma_sync_sg_for_device); @@ -405,7 +446,7 @@ static void dma_setup_need_sync(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_map_direct(dev, ops) || (ops->flags & DMA_F_CAN_SKIP_SYNC)) + if (dma_map_direct(dev, ops) || use_dma_iommu(dev)) /* * dma_skip_sync will be reset to %false on first SWIOTLB buffer * mapping, if any. During the device initialization, it's @@ -446,6 +487,9 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, if (dma_alloc_direct(dev, ops)) return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); + if (use_dma_iommu(dev)) + return iommu_dma_get_sgtable(dev, sgt, cpu_addr, dma_addr, + size, attrs); if (!ops->get_sgtable) return -ENXIO; return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); @@ -482,6 +526,8 @@ bool dma_can_mmap(struct device *dev) if (dma_alloc_direct(dev, ops)) return dma_direct_can_mmap(dev); + if (use_dma_iommu(dev)) + return true; return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); @@ -508,6 +554,9 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, if (dma_alloc_direct(dev, ops)) return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); + if (use_dma_iommu(dev)) + return iommu_dma_mmap(dev, vma, cpu_addr, dma_addr, size, + attrs); if (!ops->mmap) return -ENXIO; return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); @@ -559,11 +608,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dma_alloc_direct(dev, ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); + else if (use_dma_iommu(dev)) + cpu_addr = iommu_dma_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); else return NULL; + trace_dma_alloc(dev, cpu_addr, *dma_handle, size, flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); return cpu_addr; } @@ -588,9 +640,12 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, if (!cpu_addr) return; + trace_dma_free(dev, cpu_addr, dma_handle, size, attrs); debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); if (dma_alloc_direct(dev, ops)) dma_direct_free(dev, size, cpu_addr, dma_handle, attrs); + else if (use_dma_iommu(dev)) + iommu_dma_free(dev, size, cpu_addr, dma_handle, attrs); else if (ops->free) ops->free(dev, size, cpu_addr, dma_handle, attrs); } @@ -611,6 +666,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + if (use_dma_iommu(dev)) + return dma_common_alloc_pages(dev, size, dma_handle, dir, gfp); if (!ops->alloc_pages_op) return NULL; return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); @@ -621,8 +678,11 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, { struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); - if (page) + if (page) { + trace_dma_map_page(dev, page_to_phys(page), *dma_handle, size, + dir, 0); debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + } return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); @@ -635,6 +695,8 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) dma_direct_free_pages(dev, size, page, dma_handle, dir); + else if (use_dma_iommu(dev)) + dma_common_free_pages(dev, size, page, dma_handle, dir); else if (ops->free_pages) ops->free_pages(dev, size, page, dma_handle, dir); } @@ -642,6 +704,7 @@ static void __dma_free_pages(struct device *dev, size_t size, struct page *page, void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { + trace_dma_unmap_page(dev, dma_handle, size, dir, 0); debug_dma_unmap_page(dev, dma_handle, size, dir); __dma_free_pages(dev, size, page, dma_handle, dir); } @@ -697,11 +760,14 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (ops && ops->alloc_noncontiguous) sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); + else if (use_dma_iommu(dev)) + sgt = iommu_dma_alloc_noncontiguous(dev, size, dir, gfp, attrs); else sgt = alloc_single_sgt(dev, size, dir, gfp); if (sgt) { sgt->nents = 1; + trace_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); } return sgt; @@ -722,9 +788,12 @@ void dma_free_noncontiguous(struct device *dev, size_t size, { const struct dma_map_ops *ops = get_dma_ops(dev); + trace_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir, 0); debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); if (ops && ops->free_noncontiguous) ops->free_noncontiguous(dev, size, sgt, dir); + else if (use_dma_iommu(dev)) + iommu_dma_free_noncontiguous(dev, size, sgt, dir); else free_single_sgt(dev, size, sgt, dir); } @@ -772,32 +841,37 @@ static int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); + if (use_dma_iommu(dev)) { + if (WARN_ON(ops)) + return false; + return true; + } + /* - * ->dma_supported sets the bypass flag, so we must always call - * into the method here unless the device is truly direct mapped. + * ->dma_supported sets and clears the bypass flag, so ignore it here + * and always call into the method if there is one. */ - if (!ops) - return dma_direct_supported(dev, mask); - if (!ops->dma_supported) - return 1; - return ops->dma_supported(dev, mask); + if (ops) { + if (!ops->dma_supported) + return true; + return ops->dma_supported(dev, mask); + } + + return dma_direct_supported(dev, mask); } bool dma_pci_p2pdma_supported(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - /* if ops is not set, dma direct will be used which supports P2PDMA */ - if (!ops) - return true; - /* * Note: dma_ops_bypass is not checked here because P2PDMA should * not be used with dma mapping ops that do not have support even * if the specific device is bypassing them. */ - return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED; + /* if ops is not set, dma direct and default IOMMU support P2PDMA */ + return !ops; } EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported); @@ -865,6 +939,8 @@ size_t dma_max_mapping_size(struct device *dev) if (dma_map_direct(dev, ops)) size = dma_direct_max_mapping_size(dev); + else if (use_dma_iommu(dev)) + size = iommu_dma_max_mapping_size(dev); else if (ops && ops->max_mapping_size) size = ops->max_mapping_size(dev); @@ -877,7 +953,9 @@ size_t dma_opt_mapping_size(struct device *dev) const struct dma_map_ops *ops = get_dma_ops(dev); size_t size = SIZE_MAX; - if (ops && ops->opt_mapping_size) + if (use_dma_iommu(dev)) + size = iommu_dma_opt_mapping_size(); + else if (ops && ops->opt_mapping_size) size = ops->opt_mapping_size(); return min(dma_max_mapping_size(dev), size); @@ -888,6 +966,9 @@ unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); + if (use_dma_iommu(dev)) + return iommu_dma_get_merge_boundary(dev); + if (!ops || !ops->get_merge_boundary) return 0; /* can't merge */ diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c index af4a6ef48ce0..9afd569eadb9 100644 --- a/kernel/dma/ops_helpers.c +++ b/kernel/dma/ops_helpers.c @@ -4,6 +4,7 @@ * the allocated memory contains normal pages in the direct kernel mapping. */ #include <linux/dma-map-ops.h> +#include <linux/iommu-dma.h> static struct page *dma_common_vaddr_to_page(void *cpu_addr) { @@ -70,8 +71,12 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; - *dma_handle = ops->map_page(dev, page, 0, size, dir, - DMA_ATTR_SKIP_CPU_SYNC); + if (use_dma_iommu(dev)) + *dma_handle = iommu_dma_map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + else + *dma_handle = ops->map_page(dev, page, 0, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); if (*dma_handle == DMA_MAPPING_ERROR) { dma_free_contiguous(dev, page, size); return NULL; @@ -86,7 +91,10 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page, { const struct dma_map_ops *ops = get_dma_ops(dev); - if (ops->unmap_page) + if (use_dma_iommu(dev)) + iommu_dma_unmap_page(dev, dma_handle, size, dir, + DMA_ATTR_SKIP_CPU_SYNC); + else if (ops->unmap_page) ops->unmap_page(dev, dma_handle, size, dir, DMA_ATTR_SKIP_CPU_SYNC); dma_free_contiguous(dev, page, size); diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index d10613eb0f63..7b04f7575796 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -70,9 +70,9 @@ static bool cma_in_zone(gfp_t gfp) /* CMA can't cross zone boundaries, see cma_activate_area() */ end = cma_get_base(cma) + size - 1; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) - return end <= DMA_BIT_MASK(zone_dma_bits); + return end <= zone_dma_limit; if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) - return end <= DMA_BIT_MASK(32); + return end <= max(DMA_BIT_MASK(32), zone_dma_limit); return true; } diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 27596f3b4aef..9e2afad1c615 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -10,8 +10,10 @@ struct page **dma_common_find_pages(void *cpu_addr) { struct vm_struct *area = find_vm_area(cpu_addr); - if (!area || area->flags != VM_DMA_COHERENT) + if (!area || !(area->flags & VM_DMA_COHERENT)) return NULL; + WARN(area->flags != VM_DMA_COHERENT, + "unexpected flags in area: %p\n", cpu_addr); return area->pages; } @@ -61,7 +63,7 @@ void dma_common_free_remap(void *cpu_addr, size_t size) { struct vm_struct *area = find_vm_area(cpu_addr); - if (!area || area->flags != VM_DMA_COHERENT) { + if (!area || !(area->flags & VM_DMA_COHERENT)) { WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); return; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index df68d29740a0..abcf3fa63a56 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -450,9 +450,9 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, if (!remap) io_tlb_default_mem.can_grow = true; if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp_mask & __GFP_DMA)) - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(zone_dma_bits); + io_tlb_default_mem.phys_limit = zone_dma_limit; else if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp_mask & __GFP_DMA32)) - io_tlb_default_mem.phys_limit = DMA_BIT_MASK(32); + io_tlb_default_mem.phys_limit = max(DMA_BIT_MASK(32), zone_dma_limit); else io_tlb_default_mem.phys_limit = virt_to_phys(high_memory - 1); #endif @@ -629,7 +629,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes, } gfp &= ~GFP_ZONEMASK; - if (phys_limit <= DMA_BIT_MASK(zone_dma_bits)) + if (phys_limit <= zone_dma_limit) gfp |= __GFP_DMA; else if (phys_limit <= DMA_BIT_MASK(32)) gfp |= __GFP_DMA32; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 90843cc38588..5b6934e23c21 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -182,7 +182,7 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) unsigned long work = READ_ONCE(current_thread_info()->syscall_work); unsigned long nr = syscall_get_nr(current, regs); - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + CT_WARN_ON(ct_state() != CT_STATE_KERNEL); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { if (WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr)) diff --git a/kernel/events/core.c b/kernel/events/core.c index c973e3c11e03..4f03eb908e7f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } +enum event_type_t { + EVENT_FLEXIBLE = 0x01, + EVENT_PINNED = 0x02, + EVENT_TIME = 0x04, + EVENT_FROZEN = 0x08, + /* see ctx_resched() for details */ + EVENT_CPU = 0x10, + EVENT_CGROUP = 0x20, + + /* compound helpers */ + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, + EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, +}; + +static inline void __perf_ctx_lock(struct perf_event_context *ctx) +{ + raw_spin_lock(&ctx->lock); + WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); +} + static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - raw_spin_lock(&cpuctx->ctx.lock); + __perf_ctx_lock(&cpuctx->ctx); if (ctx) - raw_spin_lock(&ctx->lock); + __perf_ctx_lock(ctx); +} + +static inline void __perf_ctx_unlock(struct perf_event_context *ctx) +{ + /* + * If ctx_sched_in() didn't again set any ALL flags, clean up + * after ctx_sched_out() by clearing is_active. + */ + if (ctx->is_active & EVENT_FROZEN) { + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + else + ctx->is_active &= ~EVENT_FROZEN; + } + raw_spin_unlock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); + __perf_ctx_unlock(ctx); + __perf_ctx_unlock(&cpuctx->ctx); } #define TASK_TOMBSTONE ((void *)-1L) @@ -264,6 +299,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, @@ -291,22 +327,25 @@ again: if (!task_function_call(task, event_function, &efs)) return; - raw_spin_lock_irq(&ctx->lock); + local_irq_disable(); + cpuctx = this_cpu_ptr(&perf_cpu_context); + perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; - if (task == TASK_TOMBSTONE) { - raw_spin_unlock_irq(&ctx->lock); - return; - } + if (task == TASK_TOMBSTONE) + goto unlock; if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); goto again; } func(event, NULL, ctx, data); - raw_spin_unlock_irq(&ctx->lock); +unlock: + perf_ctx_unlock(cpuctx, ctx); + local_irq_enable(); } /* @@ -369,16 +408,6 @@ unlock: (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_TIME = 0x4, - /* see ctx_resched() for details */ - EVENT_CPU = 0x8, - EVENT_CGROUP = 0x10, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - /* * perf_sched_events : >0 events exist */ @@ -407,6 +436,11 @@ static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static cpumask_var_t perf_online_core_mask; +static cpumask_var_t perf_online_die_mask; +static cpumask_var_t perf_online_cluster_mask; +static cpumask_var_t perf_online_pkg_mask; +static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; /* @@ -685,30 +719,32 @@ do { \ ___p; \ }) +#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ + list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ + if (_cgroup && !_epc->nr_cgroups) \ + continue; \ + else if (_pmu && _epc->pmu != _pmu) \ + continue; \ + else + static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); - } } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); - } } -static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); -static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); +static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF @@ -865,7 +901,7 @@ static void perf_cgroup_switch(struct task_struct *task) perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true); - ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in @@ -877,7 +913,7 @@ static void perf_cgroup_switch(struct task_struct *task) * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ - ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); + ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -1255,8 +1291,9 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock - * perf_event::mmap_mutex * mmap_lock + * perf_event::mmap_mutex + * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock @@ -1768,6 +1805,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu) typeof(*event), group_node)) /* + * Does the event attribute request inherit with PERF_SAMPLE_READ + */ +static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) +{ + return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); +} + +/* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ @@ -1797,6 +1842,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; + if (has_inherit_and_sample_read(&event->attr)) + local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); @@ -2021,6 +2068,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; + if (has_inherit_and_sample_read(&event->attr)) + local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); @@ -2316,6 +2365,45 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) event_sched_out(event, ctx); } +static inline void +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, final); + } +} + +static inline void +ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + __ctx_time_update(cpuctx, ctx, false); +} + +/* + * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). + */ +static inline void +ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) +{ + ctx_time_update(cpuctx, ctx); + if (ctx->is_active & EVENT_TIME) + ctx->is_active |= EVENT_FROZEN; +} + +static inline void +ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) +{ + if (ctx->is_active & EVENT_TIME) { + if (ctx->is_active & EVENT_FROZEN) + return; + update_context_time(ctx); + update_cgrp_time_from_event(event); + } +} + #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL @@ -2335,10 +2423,7 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, false); - } + ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least @@ -2423,12 +2508,8 @@ static void __perf_event_disable(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return; - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } - perf_pmu_disable(event->pmu_ctx->pmu); + ctx_time_update_event(ctx, event); if (event == event->group_leader) group_sched_out(event, ctx); @@ -2644,7 +2725,8 @@ static void add_event_to_ctx(struct perf_event *event, } static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) + struct pmu *pmu, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); @@ -2654,18 +2736,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, event_type); + ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) + struct perf_event_context *ctx, + struct pmu *pmu) { - ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* @@ -2683,16 +2766,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ -/* - * XXX: ctx_resched() reschedule entire perf_event_context while adding new - * event to the context or enabling existing event in the context. We can - * probably optimize it by rescheduling only affected pmu_ctx. - */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, - enum event_type_t event_type) + struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); + struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be @@ -2703,10 +2782,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - perf_ctx_disable(&cpuctx->ctx, false); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_disable(epc->pmu); + if (task_ctx) { - perf_ctx_disable(task_ctx, false); - task_ctx_sched_out(task_ctx, event_type); + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_disable(epc->pmu); + + task_ctx_sched_out(task_ctx, pmu, event_type); } /* @@ -2717,15 +2800,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. */ if (cpu_event) - ctx_sched_out(&cpuctx->ctx, event_type); + ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx); + perf_event_sched_in(cpuctx, task_ctx, pmu); - perf_ctx_enable(&cpuctx->ctx, false); - if (task_ctx) - perf_ctx_enable(task_ctx, false); + for_each_epc(epc, &cpuctx->ctx, pmu, false) + perf_pmu_enable(epc->pmu); + + if (task_ctx) { + for_each_epc(epc, task_ctx, pmu, false) + perf_pmu_enable(epc->pmu); + } } void perf_pmu_resched(struct pmu *pmu) @@ -2734,7 +2821,7 @@ void perf_pmu_resched(struct pmu *pmu) struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); - ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } @@ -2790,9 +2877,10 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, + get_event_type(event)); } else { add_event_to_ctx(event, ctx); } @@ -2935,8 +3023,7 @@ static void __perf_event_enable(struct perf_event *event, event->state <= PERF_EVENT_STATE_ERROR) return; - if (ctx->is_active) - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2944,25 +3031,21 @@ static void __perf_event_enable(struct perf_event *event, if (!ctx->is_active) return; - if (!event_filter_match(event)) { - ctx_sched_in(ctx, EVENT_TIME); + if (!event_filter_match(event)) return; - } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, EVENT_TIME); + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; - } task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); - ctx_resched(cpuctx, task_ctx, get_event_type(event)); + ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* @@ -3230,7 +3313,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; - if (ctx->task && !ctx->is_active) { + if (ctx->task && !(ctx->is_active & EVENT_ALL)) { struct perf_cpu_pmu_context *cpc; cpc = this_cpu_ptr(pmu->cpu_pmu_context); @@ -3238,7 +3321,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, cpc->task_epc = NULL; } - if (!event_type) + if (!(event_type & EVENT_ALL)) return; perf_pmu_disable(pmu); @@ -3264,8 +3347,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, perf_pmu_enable(pmu); } +/* + * Be very careful with the @pmu argument since this will change ctx state. + * The @pmu argument works for ctx_resched(), because that is symmetric in + * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. + * + * However, if you were to be asymmetrical, you could end up with messed up + * state, eg. ctx->is_active cleared even though most EPCs would still actually + * be active. + */ static void -ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) +ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; @@ -3296,34 +3388,36 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) * * would only update time for the pinned events. */ - if (is_active & EVENT_TIME) { - /* update (and stop) ctx time */ - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + ctx->is_active &= ~event_type; + + if (!(ctx->is_active & EVENT_ALL)) { /* - * CPU-release for the below ->is_active store, - * see __load_acquire() in perf_event_time_now() + * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() + * does not observe a hole. perf_ctx_unlock() will clean up. */ - barrier(); + if (ctx->is_active & EVENT_FROZEN) + ctx->is_active &= EVENT_TIME_FROZEN; + else + ctx->is_active = 0; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) + if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; + for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_out(pmu_ctx, is_active); - } } /* @@ -3516,12 +3610,17 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_disable(ctx, false); - /* PMIs are disabled; ctx->nr_pending is stable. */ - if (local_read(&ctx->nr_pending) || - local_read(&next_ctx->nr_pending)) { + /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ + if (local_read(&ctx->nr_no_switch_fast) || + local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. + * + * Likewise, when a context contains inherit + + * SAMPLE_READ events they should be switched + * out using the slow path so that they are + * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); @@ -3562,7 +3661,7 @@ unlock: inside_switch: perf_ctx_sched_task_cb(ctx, false); - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); @@ -3860,29 +3959,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx, merge_sched_in, &can_add_hw); } -static void ctx_groups_sched_in(struct perf_event_context *ctx, - struct perf_event_groups *groups, - bool cgroup) +static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { - struct perf_event_pmu_context *pmu_ctx; - - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - if (cgroup && !pmu_ctx->nr_cgroups) - continue; - pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); - } -} + struct perf_event_context *ctx = pmu_ctx->ctx; -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, - struct pmu *pmu) -{ - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); + if (event_type & EVENT_PINNED) + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + if (event_type & EVENT_FLEXIBLE) + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) +ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; @@ -3906,7 +3998,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { - if (!is_active) + if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); @@ -3918,12 +4010,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (is_active & EVENT_PINNED) - ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); + if (is_active & EVENT_PINNED) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + } /* Then walk through the lower prio flexible groups */ - if (is_active & EVENT_FLEXIBLE) - ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); + if (is_active & EVENT_FLEXIBLE) { + for_each_epc(pmu_ctx, ctx, pmu, cgroup) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + } } static void perf_event_context_sched_in(struct task_struct *task) @@ -3966,10 +4062,10 @@ static void perf_event_context_sched_in(struct task_struct *task) */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); - ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx); + perf_event_sched_in(cpuctx, ctx, NULL); perf_ctx_sched_task_cb(cpuctx->task_ctx, true); @@ -4092,7 +4188,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ + if (delta >= 0) + delta += 7; + else + delta -= 7; + delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; @@ -4310,14 +4410,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) update_context_time(&cpuctx->ctx); __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); - __pmu_ctx_sched_in(&cpuctx->ctx, pmu); + __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE); } if (task_event) rotate_ctx(task_epc->ctx, task_event); if (task_event || (task_epc && cpu_event)) - __pmu_ctx_sched_in(task_epc->ctx, pmu); + __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4383,7 +4483,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, EVENT_TIME); + ctx_time_freeze(cpuctx, ctx); list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); @@ -4395,9 +4495,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) */ if (enabled) { clone_ctx = unclone_ctx(ctx); - ctx_resched(cpuctx, ctx, event_type); - } else { - ctx_sched_in(ctx, EVENT_TIME); + ctx_resched(cpuctx, ctx, NULL, event_type); } perf_ctx_unlock(cpuctx, ctx); @@ -4458,16 +4556,24 @@ struct perf_read_data { int ret; }; +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu); + static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { + int local_cpu = smp_processor_id(); u16 local_pkg, event_pkg; if ((unsigned)event_cpu >= nr_cpu_ids) return event_cpu; - if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { - int local_cpu = smp_processor_id(); + if (event->group_caps & PERF_EV_CAP_READ_SCOPE) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu); + + if (cpumask && cpumask_test_cpu(local_cpu, cpumask)) + return local_cpu; + } + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { event_pkg = topology_physical_package_id(event_cpu); local_pkg = topology_physical_package_id(local_cpu); @@ -4500,10 +4606,7 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (data->group) @@ -4538,8 +4641,11 @@ unlock: raw_spin_unlock(&ctx->lock); } -static inline u64 perf_event_count(struct perf_event *event) +static inline u64 perf_event_count(struct perf_event *event, bool self) { + if (self) + return local64_read(&event->count); + return local64_read(&event->count) + atomic64_read(&event->child_count); } @@ -4700,10 +4806,7 @@ again: * May read while context is not active (e.g., thread is * blocked), in that case we cannot update context time */ - if (ctx->is_active & EVENT_TIME) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - } + ctx_time_update_event(ctx, event); perf_event_update_time(event); if (group) @@ -5204,7 +5307,7 @@ static void perf_pending_task_sync(struct perf_event *event) */ if (task_work_cancel(current, head)) { event->pending_work = 0; - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); return; } @@ -5498,7 +5601,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * mutex_lock(&event->child_mutex); (void)perf_event_read(event, false); - total += perf_event_count(event); + total += perf_event_count(event, false); *enabled += event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -5507,7 +5610,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 * list_for_each_entry(child, &event->child_list, child_list) { (void)perf_event_read(child, false); - total += perf_event_count(child); + total += perf_event_count(child, false); *enabled += child->total_time_enabled; *running += child->total_time_running; } @@ -5589,14 +5692,14 @@ static int __perf_read_group_add(struct perf_event *leader, /* * Write {count,id} tuples for every sibling. */ - values[n++] += perf_event_count(leader); + values[n++] += perf_event_count(leader, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) values[n++] = atomic64_read(&leader->lost_samples); for_each_sibling_event(sub, leader) { - values[n++] += perf_event_count(sub); + values[n++] += perf_event_count(sub, false); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -6176,7 +6279,7 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = perf_event_count(event); + userpg->offset = perf_event_count(event, false); if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); @@ -6373,12 +6476,11 @@ static void perf_mmap_close(struct vm_area_struct *vma) event->pmu->event_unmapped(event, vma->vm_mm); /* - * rb->aux_mmap_count will always drop before rb->mmap_count and - * event->mmap_count, so it is ok to use event->mmap_mutex to - * serialize with perf_mmap here. + * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex + * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6395,7 +6497,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) rb_free_aux(rb); WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); - mutex_unlock(&event->mmap_mutex); + mutex_unlock(&rb->aux_mutex); } if (atomic_dec_and_test(&rb->mmap_count)) @@ -6483,6 +6585,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + struct mutex *aux_mutex = NULL; struct perf_buffer *rb = NULL; unsigned long locked, lock_limit; unsigned long vma_size; @@ -6531,6 +6634,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (!rb) goto aux_unlock; + aux_mutex = &rb->aux_mutex; + mutex_lock(aux_mutex); + aux_offset = READ_ONCE(rb->user_page->aux_offset); aux_size = READ_ONCE(rb->user_page->aux_size); @@ -6681,6 +6787,8 @@ unlock: atomic_dec(&rb->mmap_count); } aux_unlock: + if (aux_mutex) + mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); /* @@ -6868,7 +6976,7 @@ static void perf_pending_task(struct callback_head *head) if (event->pending_work) { event->pending_work = 0; perf_sigtrap(event); - local_dec(&event->ctx->nr_pending); + local_dec(&event->ctx->nr_no_switch_fast); rcuwait_wake_up(&event->pending_work_wait); } rcu_read_unlock(); @@ -7250,7 +7358,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[5]; int n = 0; - values[n++] = perf_event_count(event); + values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr)); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); @@ -7268,14 +7376,15 @@ static void perf_output_read_one(struct perf_output_handle *handle, } static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event, - u64 enabled, u64 running) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; unsigned long flags; u64 values[6]; int n = 0; + bool self = has_inherit_and_sample_read(&event->attr); /* * Disabling interrupts avoids all counter scheduling @@ -7295,7 +7404,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, (leader->state == PERF_EVENT_STATE_ACTIVE)) leader->pmu->read(leader); - values[n++] = perf_event_count(leader); + values[n++] = perf_event_count(leader, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); if (read_format & PERF_FORMAT_LOST) @@ -7310,7 +7419,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); - values[n++] = perf_event_count(sub); + values[n++] = perf_event_count(sub, self); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); if (read_format & PERF_FORMAT_LOST) @@ -7331,6 +7440,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, * The problem is that its both hard and excessively expensive to iterate the * child list, not to mention that its impossible to IPI the children running * on another CPU, from interrupt/NMI context. + * + * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread + * counts rather than attempting to accumulate some value across all children on + * all cores. */ static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) @@ -9741,7 +9854,7 @@ static int __perf_event_overflow(struct perf_event *event, if (!event->pending_work && !task_work_add(current, &event->pending_task, notify_mode)) { event->pending_work = pending_id; - local_inc(&event->ctx->nr_pending); + local_inc(&event->ctx->nr_no_switch_fast); event->pending_addr = 0; if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) @@ -11478,10 +11591,60 @@ perf_event_mux_interval_ms_store(struct device *dev, } static DEVICE_ATTR_RW(perf_event_mux_interval_ms); +static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return topology_sibling_cpumask(cpu); + case PERF_PMU_SCOPE_DIE: + return topology_die_cpumask(cpu); + case PERF_PMU_SCOPE_CLUSTER: + return topology_cluster_cpumask(cpu); + case PERF_PMU_SCOPE_PKG: + return topology_core_cpumask(cpu); + case PERF_PMU_SCOPE_SYS_WIDE: + return cpu_online_mask; + } + + return NULL; +} + +static inline struct cpumask *perf_scope_cpumask(unsigned int scope) +{ + switch (scope) { + case PERF_PMU_SCOPE_CORE: + return perf_online_core_mask; + case PERF_PMU_SCOPE_DIE: + return perf_online_die_mask; + case PERF_PMU_SCOPE_CLUSTER: + return perf_online_cluster_mask; + case PERF_PMU_SCOPE_PKG: + return perf_online_pkg_mask; + case PERF_PMU_SCOPE_SYS_WIDE: + return perf_online_sys_mask; + } + + return NULL; +} + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct cpumask *mask = perf_scope_cpumask(pmu->scope); + + if (mask) + return cpumap_print_to_pagebuf(true, buf, mask); + return 0; +} + +static DEVICE_ATTR_RO(cpumask); + static struct attribute *pmu_dev_attrs[] = { &dev_attr_type.attr, &dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_nr_addr_filters.attr, + &dev_attr_cpumask.attr, NULL, }; @@ -11493,6 +11656,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int if (n == 2 && !pmu->nr_addr_filters) return 0; + /* cpumask */ + if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE) + return 0; + return a->mode; } @@ -11577,6 +11744,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) goto free_pdc; } + if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) { + ret = -EINVAL; + goto free_pdc; + } + pmu->name = name; if (type >= 0) @@ -11731,6 +11903,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) event_has_any_exclude_flag(event)) ret = -EINVAL; + if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope); + int cpu; + + if (pmu_cpumask && cpumask) { + cpu = cpumask_any_and(pmu_cpumask, cpumask); + if (cpu >= nr_cpu_ids) + ret = -ENODEV; + else + event->event_caps |= PERF_EV_CAP_READ_SCOPE; + } else { + ret = -ENODEV; + } + } + if (ret && event->destroy) event->destroy(event); } @@ -12058,10 +12246,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, local64_set(&hwc->period_left, hwc->sample_period); /* - * We currently do not support PERF_SAMPLE_READ on inherited events. + * We do not support PERF_SAMPLE_READ on inherited events unless + * PERF_SAMPLE_TID is also selected, which allows inherited events to + * collect per-thread samples. * See perf_output_read(). */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)) + if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID)) goto err_ns; if (!has_branch_stack(event)) @@ -13085,7 +13275,7 @@ static void sync_child_event(struct perf_event *child_event) perf_event_read_event(child_event, task); } - child_val = perf_event_count(child_event); + child_val = perf_event_count(child_event, false); /* * Add back the child's count to the parent's count: @@ -13176,7 +13366,7 @@ static void perf_event_exit_task_context(struct task_struct *child) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, NULL, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation @@ -13352,6 +13542,15 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event) return &event->attr; } +int perf_allow_kernel(struct perf_event_attr *attr) +{ + if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) + return -EACCES; + + return security_perf_event_open(attr, PERF_SECURITY_KERNEL); +} +EXPORT_SYMBOL_GPL(perf_allow_kernel); + /* * Inherit an event from parent task to child task. * @@ -13682,6 +13881,12 @@ static void __init perf_event_init_all_cpus(void) int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL); + zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL); + for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); @@ -13725,12 +13930,46 @@ static void __perf_event_exit_context(void *__info) struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, EVENT_TIME); + ctx_sched_out(ctx, NULL, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); } +static void perf_event_clear_cpumask(unsigned int cpu) +{ + int target[PERF_PMU_MAX_SCOPE]; + unsigned int scope; + struct pmu *pmu; + + cpumask_clear_cpu(cpu, perf_online_mask); + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + struct cpumask *pmu_cpumask = perf_scope_cpumask(scope); + + target[scope] = -1; + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask)) + continue; + target[scope] = cpumask_any_but(cpumask, cpu); + if (target[scope] < nr_cpu_ids) + cpumask_set_cpu(target[scope], pmu_cpumask); + } + + /* migrate */ + list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + if (pmu->scope == PERF_PMU_SCOPE_NONE || + WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE)) + continue; + + if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids) + perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]); + } +} + static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; @@ -13738,6 +13977,11 @@ static void perf_event_exit_cpu_context(int cpu) // XXX simplify cpuctx->online mutex_lock(&pmus_lock); + /* + * Clear the cpumasks, and migrate to other CPUs if possible. + * Must be invoked before the __perf_event_exit_context. + */ + perf_event_clear_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; @@ -13745,7 +13989,6 @@ static void perf_event_exit_cpu_context(int cpu) smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); cpuctx->online = 0; mutex_unlock(&ctx->mutex); - cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } #else @@ -13754,6 +13997,42 @@ static void perf_event_exit_cpu_context(int cpu) { } #endif +static void perf_event_setup_cpumask(unsigned int cpu) +{ + struct cpumask *pmu_cpumask; + unsigned int scope; + + cpumask_set_cpu(cpu, perf_online_mask); + + /* + * Early boot stage, the cpumask hasn't been set yet. + * The perf_online_<domain>_masks includes the first CPU of each domain. + * Always uncondifionally set the boot CPU for the perf_online_<domain>_masks. + */ + if (!topology_sibling_cpumask(cpu)) { + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + pmu_cpumask = perf_scope_cpumask(scope); + if (WARN_ON_ONCE(!pmu_cpumask)) + continue; + cpumask_set_cpu(cpu, pmu_cpumask); + } + return; + } + + for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) { + const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu); + + pmu_cpumask = perf_scope_cpumask(scope); + + if (WARN_ON_ONCE(!pmu_cpumask || !cpumask)) + continue; + + if (!cpumask_empty(cpumask) && + cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids) + cpumask_set_cpu(cpu, pmu_cpumask); + } +} + int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; @@ -13762,7 +14041,7 @@ int perf_event_init_cpu(unsigned int cpu) perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); - cpumask_set_cpu(cpu, perf_online_mask); + perf_event_setup_cpumask(cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); ctx = &cpuctx->ctx; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 451514442a1b..e072d995d670 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -40,6 +40,7 @@ struct perf_buffer { struct user_struct *mmap_user; /* AUX area */ + struct mutex aux_mutex; long aux_head; unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 8cadf97bc290..4f46f688d0d4 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -337,6 +337,8 @@ ring_buffer_init(struct perf_buffer *rb, long watermark, int flags) */ if (!rb->nr_pages) rb->paused = 1; + + mutex_init(&rb->aux_mutex); } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 73cc47708679..2ec796e2f055 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -40,6 +40,9 @@ static struct rb_root uprobes_tree = RB_ROOT; #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ +static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); + +DEFINE_STATIC_SRCU(uprobes_srcu); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -57,8 +60,9 @@ struct uprobe { struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; - struct uprobe_consumer *consumers; + struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ + struct rcu_head rcu; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; @@ -99,8 +103,7 @@ struct xol_area { atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ - struct vm_special_mapping xol_mapping; - struct page *pages[2]; + struct page *page; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make @@ -109,6 +112,11 @@ struct xol_area { unsigned long vaddr; /* Page(s) of instruction slots */ }; +static void uprobe_warn(struct task_struct *t, const char *msg) +{ + pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); +} + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -453,7 +461,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * - * Called with mm->mmap_lock held for write. + * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, @@ -587,25 +595,63 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v *(uprobe_opcode_t *)&auprobe->insn); } +/* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } +/* + * uprobe should have guaranteed lifetime, which can be either of: + * - caller already has refcount taken (and wants an extra one); + * - uprobe is RCU protected and won't be freed until after grace period; + * - we are holding uprobes_treelock (for read or write, doesn't matter). + */ +static struct uprobe *try_get_uprobe(struct uprobe *uprobe) +{ + if (refcount_inc_not_zero(&uprobe->ref)) + return uprobe; + return NULL; +} + +static inline bool uprobe_is_active(struct uprobe *uprobe) +{ + return !RB_EMPTY_NODE(&uprobe->rb_node); +} + +static void uprobe_free_rcu(struct rcu_head *rcu) +{ + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + kfree(uprobe); +} + static void put_uprobe(struct uprobe *uprobe) { - if (refcount_dec_and_test(&uprobe->ref)) { - /* - * If application munmap(exec_vma) before uprobe_unregister() - * gets called, we don't get a chance to remove uprobe from - * delayed_uprobe_list from remove_breakpoint(). Do it here. - */ - mutex_lock(&delayed_uprobe_lock); - delayed_uprobe_remove(uprobe, NULL); - mutex_unlock(&delayed_uprobe_lock); - kfree(uprobe); + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + write_lock(&uprobes_treelock); + + if (uprobe_is_active(uprobe)) { + write_seqcount_begin(&uprobes_seqcount); + rb_erase(&uprobe->rb_node, &uprobes_tree); + write_seqcount_end(&uprobes_seqcount); } + + write_unlock(&uprobes_treelock); + + /* + * If application munmap(exec_vma) before uprobe_unregister() + * gets called, we don't get a chance to remove uprobe from + * delayed_uprobe_list from remove_breakpoint(). Do it here. + */ + mutex_lock(&delayed_uprobe_lock); + delayed_uprobe_remove(uprobe, NULL); + mutex_unlock(&delayed_uprobe_lock); + + call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); } static __always_inline @@ -647,62 +693,86 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } -static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) +/* + * Assumes being inside RCU protected region. + * No refcount is taken on returned uprobe. + */ +static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; - struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); + struct rb_node *node; + unsigned int seq; + + lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); - if (node) - return get_uprobe(__node_2_uprobe(node)); + do { + seq = read_seqcount_begin(&uprobes_seqcount); + node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); + /* + * Lockless RB-tree lookups can result only in false negatives. + * If the element is found, it is correct and can be returned + * under RCU protection. If we find nothing, we need to + * validate that seqcount didn't change. If it did, we have to + * try again as we might have missed the element (false + * negative). If seqcount is unchanged, search truly failed. + */ + if (node) + return __node_2_uprobe(node); + } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* - * Find a uprobe corresponding to a given inode:offset - * Acquires uprobes_treelock + * Attempt to insert a new uprobe into uprobes_tree. + * + * If uprobe already exists (for given inode+offset), we just increment + * refcount of previously existing uprobe. + * + * If not, a provided new instance of uprobe is inserted into the tree (with + * assumed initial refcount == 1). + * + * In any case, we return a uprobe instance that ends up being in uprobes_tree. + * Caller has to clean up new uprobe instance, if it ended up not being + * inserted into the tree. + * + * We assume that uprobes_treelock is held for writing. */ -static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) -{ - struct uprobe *uprobe; - - read_lock(&uprobes_treelock); - uprobe = __find_uprobe(inode, offset); - read_unlock(&uprobes_treelock); - - return uprobe; -} - static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; +again: + node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) { + struct uprobe *u = __node_2_uprobe(node); + + if (!try_get_uprobe(u)) { + rb_erase(node, &uprobes_tree); + RB_CLEAR_NODE(&u->rb_node); + goto again; + } - node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); - if (node) - return get_uprobe(__node_2_uprobe(node)); + return u; + } - /* get access + creation ref */ - refcount_set(&uprobe->ref, 2); - return NULL; + return uprobe; } /* - * Acquire uprobes_treelock. - * Matching uprobe already exists in rbtree; - * increment (access refcount) and return the matching uprobe. - * - * No matching uprobe; insert the uprobe in rb_tree; - * get a double refcount (access + creation) and return NULL. + * Acquire uprobes_treelock and insert uprobe into uprobes_tree + * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); + write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); + write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; @@ -725,18 +795,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) - return NULL; + return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; + INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); + RB_CLEAR_NODE(&uprobe->rb_node); + refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ - if (cur_uprobe) { + if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); @@ -753,32 +826,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); - uc->next = uprobe->consumers; - uprobe->consumers = uc; + list_add_rcu(&uc->cons_node, &uprobe->consumers); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. - * Return true if the @uc is deleted successfully - * or return false. + * Should never be called with consumer that's not part of @uprobe->consumers. */ -static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) +static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { - struct uprobe_consumer **con; - bool ret = false; - down_write(&uprobe->consumer_rwsem); - for (con = &uprobe->consumers; *con; con = &(*con)->next) { - if (*con == uc) { - *con = uc->next; - ret = true; - break; - } - } + list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); - - return ret; } static int __copy_insn(struct address_space *mapping, struct file *filp, @@ -863,21 +923,20 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, return ret; } -static inline bool consumer_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { - return !uc->filter || uc->filter(uc, ctx, mm); + return !uc->filter || uc->filter(uc, mm); } -static bool filter_chain(struct uprobe *uprobe, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { - ret = consumer_filter(uc, ctx, mm); + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { + ret = consumer_filter(uc, mm); if (ret) break; } @@ -921,27 +980,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad return set_orig_insn(&uprobe->arch, mm, vaddr); } -static inline bool uprobe_is_active(struct uprobe *uprobe) -{ - return !RB_EMPTY_NODE(&uprobe->rb_node); -} -/* - * There could be threads that have already hit the breakpoint. They - * will recheck the current insn and restart if find_uprobe() fails. - * See find_active_uprobe(). - */ -static void delete_uprobe(struct uprobe *uprobe) -{ - if (WARN_ON(!uprobe_is_active(uprobe))) - return; - - write_lock(&uprobes_treelock); - rb_erase(&uprobe->rb_node, &uprobes_tree); - write_unlock(&uprobes_treelock); - RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ - put_uprobe(uprobe); -} - struct map_info { struct map_info *next; struct mm_struct *mm; @@ -1046,7 +1084,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (err && is_register) goto free; - + /* + * We take mmap_lock for writing to avoid the race with + * find_active_uprobe_rcu() which takes mmap_lock for reading. + * Thus this install_breakpoint() can not make + * is_trap_at_addr() true right after find_uprobe_rcu() + * returns NULL in find_active_uprobe_rcu(). + */ mmap_write_lock(mm); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || @@ -1059,12 +1103,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) if (is_register) { /* consult only the "caller", new consumer. */ - if (consumer_filter(new, - UPROBE_FILTER_REGISTER, mm)) + if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { - if (!filter_chain(uprobe, - UPROBE_FILTER_UNREGISTER, mm)) + if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } @@ -1079,152 +1121,140 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) return err; } -static void -__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +/** + * uprobe_unregister_nosync - unregister an already registered probe. + * @uprobe: uprobe to remove + * @uc: identify which probe if multiple probes are colocated. + */ +void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; - if (WARN_ON(!consumer_del(uprobe, uc))) - return; - + down_write(&uprobe->register_rwsem); + consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); - /* TODO : cant unregister? schedule a worker thread */ - if (!uprobe->consumers && !err) - delete_uprobe(uprobe); -} - -/* - * uprobe_unregister - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. - * @uc: identify which probe if multiple probes are colocated. - */ -void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - struct uprobe *uprobe; + up_write(&uprobe->register_rwsem); - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) + /* TODO : cant unregister? schedule a worker thread */ + if (unlikely(err)) { + uprobe_warn(current, "unregister, leaking uprobe"); return; + } - down_write(&uprobe->register_rwsem); - __uprobe_unregister(uprobe, uc); - up_write(&uprobe->register_rwsem); put_uprobe(uprobe); } -EXPORT_SYMBOL_GPL(uprobe_unregister); +EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); -/* - * __uprobe_register - register a probe +void uprobe_unregister_sync(void) +{ + /* + * Now that handler_chain() and handle_uretprobe_chain() iterate over + * uprobe->consumers list under RCU protection without holding + * uprobe->register_rwsem, we need to wait for RCU grace period to + * make sure that we can't call into just unregistered + * uprobe_consumer's callbacks anymore. If we don't do that, fast and + * unlucky enough caller can free consumer's memory and cause + * handler_chain() or handle_uretprobe_chain() to do an use-after-free. + */ + synchronize_srcu(&uprobes_srcu); +} +EXPORT_SYMBOL_GPL(uprobe_unregister_sync); + +/** + * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. + * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * - * Apart from the access refcount, __uprobe_register() takes a creation + * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe - * unregisters. Caller of __uprobe_register() is required to keep @inode + * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * - * Return errno if it cannot successully install probes - * else return 0 (success) + * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ -static int __uprobe_register(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) +struct uprobe *uprobe_register(struct inode *inode, + loff_t offset, loff_t ref_ctr_offset, + struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) - return -EINVAL; + return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) - return -EIO; + return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) - return -EINVAL; + return ERR_PTR(-EINVAL); - retry: uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); - if (!uprobe) - return -ENOMEM; if (IS_ERR(uprobe)) - return PTR_ERR(uprobe); + return uprobe; - /* - * We can race with uprobe_unregister()->delete_uprobe(). - * Check uprobe_is_active() and retry if it is false. - */ down_write(&uprobe->register_rwsem); - ret = -EAGAIN; - if (likely(uprobe_is_active(uprobe))) { - consumer_add(uprobe, uc); - ret = register_for_each_vma(uprobe, uc); - if (ret) - __uprobe_unregister(uprobe, uc); - } + consumer_add(uprobe, uc); + ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); - if (unlikely(ret == -EAGAIN)) - goto retry; - return ret; -} + if (ret) { + uprobe_unregister_nosync(uprobe, uc); + /* + * Registration might have partially succeeded, so we can have + * this consumer being called right at this time. We need to + * sync here. It's ok, it's unlikely slow path. + */ + uprobe_unregister_sync(); + return ERR_PTR(ret); + } -int uprobe_register(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, 0, uc); + return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); -int uprobe_register_refctr(struct inode *inode, loff_t offset, - loff_t ref_ctr_offset, struct uprobe_consumer *uc) -{ - return __uprobe_register(inode, offset, ref_ctr_offset, uc); -} -EXPORT_SYMBOL_GPL(uprobe_register_refctr); - -/* - * uprobe_apply - unregister an already registered probe. - * @inode: the file in which the probe has to be removed. - * @offset: offset from the start of the file. +/** + * uprobe_apply - add or remove the breakpoints according to @uc->filter + * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints + * Return: 0 on success or negative error code. */ -int uprobe_apply(struct inode *inode, loff_t offset, - struct uprobe_consumer *uc, bool add) +int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { - struct uprobe *uprobe; struct uprobe_consumer *con; - int ret = -ENOENT; - - uprobe = find_uprobe(inode, offset); - if (WARN_ON(!uprobe)) - return ret; + int ret = -ENOENT, srcu_idx; down_write(&uprobe->register_rwsem); - for (con = uprobe->consumers; con && con != uc ; con = con->next) - ; - if (con) - ret = register_for_each_vma(uprobe, add ? uc : NULL); + + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { + if (con == uc) { + ret = register_for_each_vma(uprobe, add ? uc : NULL); + break; + } + } + srcu_read_unlock(&uprobes_srcu, srcu_idx); + up_write(&uprobe->register_rwsem); - put_uprobe(uprobe); return ret; } @@ -1305,15 +1335,17 @@ static void build_probe_list(struct inode *inode, u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; - list_add(&u->pending_list, head); - get_uprobe(u); + /* if uprobe went away, it's safe to ignore it */ + if (try_get_uprobe(u)) + list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); @@ -1384,7 +1416,7 @@ int uprobe_mmap(struct vm_area_struct *vma) */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && - filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) { + filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } @@ -1433,6 +1465,21 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } +static vm_fault_t xol_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; + + vmf->page = area->page; + get_page(vmf->page); + return 0; +} + +static const struct vm_special_mapping xol_mapping = { + .name = "[uprobes]", + .fault = xol_fault, +}; + /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { @@ -1459,7 +1506,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, - &area->xol_mapping); + &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; @@ -1489,7 +1536,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) struct xol_area *area; void *insns; - area = kmalloc(sizeof(*area), GFP_KERNEL); + area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; @@ -1498,13 +1545,9 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->xol_mapping.name = "[uprobes]"; - area->xol_mapping.fault = NULL; - area->xol_mapping.pages = area->pages; - area->pages[0] = alloc_page(GFP_HIGHUSER); - if (!area->pages[0]) + area->page = alloc_page(GFP_HIGHUSER); + if (!area->page) goto free_bitmap; - area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); @@ -1512,12 +1555,12 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); insns = arch_uprobe_trampoline(&insns_size); - arch_uprobe_copy_ixol(area->pages[0], 0, insns, insns_size); + arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; - __free_page(area->pages[0]); + __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: @@ -1559,7 +1602,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->pages[0]); + put_page(area->page); kfree(area->bitmap); kfree(area); } @@ -1626,7 +1669,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, + arch_uprobe_copy_ixol(area->page, xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; @@ -1771,6 +1814,12 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; + /* + * uprobe's refcnt has to be positive at this point, kept by + * utask->return_instances items; return_instances can't be + * removed right now, as task is blocked due to duping; so + * get_uprobe() is safe to use here. + */ get_uprobe(n->uprobe); n->next = NULL; @@ -1782,12 +1831,6 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return 0; } -static void uprobe_warn(struct task_struct *t, const char *msg) -{ - pr_warn("uprobe: %s:%d failed to %s\n", - current->comm, current->pid, msg); -} - static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) @@ -1884,9 +1927,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } + /* we need to bump refcount to store uprobe in utask */ + if (!try_get_uprobe(uprobe)) + return; + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - return; + goto fail; trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); @@ -1913,8 +1960,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - - ri->uprobe = get_uprobe(uprobe); + ri->uprobe = uprobe; ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; @@ -1925,8 +1971,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) utask->return_instances = ri; return; - fail: +fail: kfree(ri); + put_uprobe(uprobe); } /* Prepare to single-step probed instruction out of line. */ @@ -1941,9 +1988,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) if (!utask) return -ENOMEM; + if (!try_get_uprobe(uprobe)) + return -EINVAL; + xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) - return -ENOMEM; + if (!xol_vaddr) { + err = -ENOMEM; + goto err_out; + } utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; @@ -1951,12 +2003,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(current); - return err; + goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; +err_out: + put_uprobe(uprobe); + return err; } /* @@ -2029,13 +2084,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - /* - * The NULL 'tsk' here ensures that any faults that occur here - * will not be accounted to the task. 'mm' *is* current->mm, - * but we treat this as a 'remote' access since it is - * essentially a kernel access to the memory. - */ - result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL); + result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; @@ -2046,7 +2095,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } -static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) +/* assumes being inside RCU protected region */ +static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; @@ -2059,7 +2109,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); - uprobe = find_uprobe(inode, offset); + uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) @@ -2080,9 +2130,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; bool need_prep = false; /* prepare return uprobe, when needed */ + bool has_consumers = false; + + current->utask->auprobe = &uprobe->arch; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { int rc = 0; if (uc->handler) { @@ -2095,16 +2148,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) need_prep = true; remove &= rc; + has_consumers = true; } + current->utask->auprobe = NULL; if (need_prep && !remove) prepare_uretprobe(uprobe, regs); /* put bp at return */ - if (remove && uprobe->consumers) { - WARN_ON(!uprobe_is_active(uprobe)); - unapply_uprobe(uprobe, current->mm); + if (remove && has_consumers) { + down_read(&uprobe->register_rwsem); + + /* re-check that removal is still required, this time under lock */ + if (!filter_chain(uprobe, current->mm)) { + WARN_ON(!uprobe_is_active(uprobe)); + unapply_uprobe(uprobe, current->mm); + } + + up_read(&uprobe->register_rwsem); } - up_read(&uprobe->register_rwsem); } static void @@ -2112,13 +2173,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) { struct uprobe *uprobe = ri->uprobe; struct uprobe_consumer *uc; + int srcu_idx; - down_read(&uprobe->register_rwsem); - for (uc = uprobe->consumers; uc; uc = uc->next) { + srcu_idx = srcu_read_lock(&uprobes_srcu); + list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, + srcu_read_lock_held(&uprobes_srcu)) { if (uc->ret_handler) uc->ret_handler(uc, ri->func, regs); } - up_read(&uprobe->register_rwsem); + srcu_read_unlock(&uprobes_srcu, srcu_idx); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) @@ -2203,13 +2266,15 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp; + int is_swbp, srcu_idx; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + srcu_idx = srcu_read_lock(&uprobes_srcu); + + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ @@ -2225,7 +2290,7 @@ static void handle_swbp(struct pt_regs *regs) */ instruction_pointer_set(regs, bp_vaddr); } - return; + goto out; } /* change it in advance for ->handler() and restart */ @@ -2260,12 +2325,12 @@ static void handle_swbp(struct pt_regs *regs) if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; - if (!pre_ssout(uprobe, regs, bp_vaddr)) - return; + if (pre_ssout(uprobe, regs, bp_vaddr)) + goto out; - /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: - put_uprobe(uprobe); + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ + srcu_read_unlock(&uprobes_srcu, srcu_idx); } /* diff --git a/kernel/exit.c b/kernel/exit.c index 7430852a8571..619f0014c33b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -428,7 +428,7 @@ static void coredump_task_exit(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); + set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); @@ -778,6 +778,62 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ +# ifdef CONFIG_STACK_GROWSUP + n--; +# else + n++; +# endif + } while (!*n); + +# ifdef CONFIG_STACK_GROWSUP + return (unsigned long)end_of_stack(p) - (unsigned long)n; +# else + return (unsigned long)n - (unsigned long)end_of_stack(p); +# endif +} + +/* Count the maximum pages reached in kernel stacks */ +static inline void kstack_histogram(unsigned long used_stack) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + if (used_stack <= 1024) + count_vm_event(KSTACK_1K); +#if THREAD_SIZE > 1024 + else if (used_stack <= 2048) + count_vm_event(KSTACK_2K); +#endif +#if THREAD_SIZE > 2048 + else if (used_stack <= 4096) + count_vm_event(KSTACK_4K); +#endif +#if THREAD_SIZE > 4096 + else if (used_stack <= 8192) + count_vm_event(KSTACK_8K); +#endif +#if THREAD_SIZE > 8192 + else if (used_stack <= 16384) + count_vm_event(KSTACK_16K); +#endif +#if THREAD_SIZE > 16384 + else if (used_stack <= 32768) + count_vm_event(KSTACK_32K); +#endif +#if THREAD_SIZE > 32768 + else if (used_stack <= 65536) + count_vm_event(KSTACK_64K); +#endif +#if THREAD_SIZE > 65536 + else + count_vm_event(KSTACK_REST); +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS */ +} + static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); @@ -785,6 +841,7 @@ static void check_stack_usage(void) unsigned long free; free = stack_not_used(current); + kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f201..5ac41db8aa69 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -832,7 +832,7 @@ static void check_mm(struct mm_struct *mm) pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } @@ -1276,7 +1276,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); @@ -1861,7 +1861,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS - INIT_LIST_HEAD(&sig->posix_timers); + INIT_HLIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); sig->real_timer.function = it_real_fn; #endif @@ -2311,7 +2311,6 @@ __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; - p->cpuset_slab_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/kernel/freezer.c b/kernel/freezer.c index f57aaf96b829..44bbd7dbd2c8 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop) bool freeze; raw_spin_lock_irq(¤t->pi_lock); - set_current_state(TASK_FROZEN); + WRITE_ONCE(current->__state, TASK_FROZEN); /* unstale saved_state so that __thaw_task() will wake us up */ current->saved_state = TASK_RUNNING; raw_spin_unlock_irq(¤t->pi_lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc94e0bf2c94..271e9139de77 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -198,7 +198,7 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, irqd_clr_managed_shutdown(d); - if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) { + if (!cpumask_intersects(aff, cpu_online_mask)) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index eb8628390156..15a7654eff68 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -37,7 +37,7 @@ static inline bool irq_needs_fixup(struct irq_data *d) * has been removed from the online mask already. */ if (cpumask_any_but(m, cpu) < nr_cpu_ids && - cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { + !cpumask_intersects(m, cpu_online_mask)) { /* * If this happens then there was a missed IRQ fixup at some * point. Warn about it and enforce fixup. @@ -110,7 +110,7 @@ static bool migrate_one_irq(struct irq_desc *desc) if (maskchip && chip->irq_mask) chip->irq_mask(d); - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + if (!cpumask_intersects(affinity, cpu_online_mask)) { /* * If the interrupt is managed, then shut it down and leave * the affinity untouched. diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 3d4036db15ac..1a3d483548e2 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -13,7 +13,6 @@ struct irq_sim_work_ctx { struct irq_work work; - int irq_base; unsigned int irq_count; unsigned long *pending; struct irq_domain *domain; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cea8f6874b1f..e0bff21f30e0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -128,72 +128,98 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); -static int irq_domain_set_name(struct irq_domain *domain, - const struct fwnode_handle *fwnode, - enum irq_domain_bus_token bus_token) +static int alloc_name(struct irq_domain *domain, char *base, enum irq_domain_bus_token bus_token) +{ + if (bus_token == DOMAIN_BUS_ANY) + domain->name = kasprintf(GFP_KERNEL, "%s", base); + else + domain->name = kasprintf(GFP_KERNEL, "%s-%d", base, bus_token); + if (!domain->name) + return -ENOMEM; + + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + return 0; +} + +static int alloc_fwnode_name(struct irq_domain *domain, const struct fwnode_handle *fwnode, + enum irq_domain_bus_token bus_token, const char *suffix) +{ + const char *sep = suffix ? "-" : ""; + const char *suf = suffix ? : ""; + char *name; + + if (bus_token == DOMAIN_BUS_ANY) + name = kasprintf(GFP_KERNEL, "%pfw%s%s", fwnode, sep, suf); + else + name = kasprintf(GFP_KERNEL, "%pfw%s%s-%d", fwnode, sep, suf, bus_token); + if (!name) + return -ENOMEM; + + /* + * fwnode paths contain '/', which debugfs is legitimately unhappy + * about. Replace them with ':', which does the trick and is not as + * offensive as '\'... + */ + domain->name = strreplace(name, '/', ':'); + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + return 0; +} + +static int alloc_unknown_name(struct irq_domain *domain, enum irq_domain_bus_token bus_token) { static atomic_t unknown_domains; - struct irqchip_fwid *fwid; + int id = atomic_inc_return(&unknown_domains); + + if (bus_token == DOMAIN_BUS_ANY) + domain->name = kasprintf(GFP_KERNEL, "unknown-%d", id); + else + domain->name = kasprintf(GFP_KERNEL, "unknown-%d-%d", id, bus_token); + if (!domain->name) + return -ENOMEM; + + domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + return 0; +} + +static int irq_domain_set_name(struct irq_domain *domain, const struct irq_domain_info *info) +{ + enum irq_domain_bus_token bus_token = info->bus_token; + const struct fwnode_handle *fwnode = info->fwnode; if (is_fwnode_irqchip(fwnode)) { - fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + /* + * The name_suffix is only intended to be used to avoid a name + * collision when multiple domains are created for a single + * device and the name is picked using a real device node. + * (Typical use-case is regmap-IRQ controllers for devices + * providing more than one physical IRQ.) There should be no + * need to use name_suffix with irqchip-fwnode. + */ + if (info->name_suffix) + return -EINVAL; switch (fwid->type) { case IRQCHIP_FWNODE_NAMED: case IRQCHIP_FWNODE_NAMED_ID: - domain->name = bus_token ? - kasprintf(GFP_KERNEL, "%s-%d", - fwid->name, bus_token) : - kstrdup(fwid->name, GFP_KERNEL); - if (!domain->name) - return -ENOMEM; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - break; + return alloc_name(domain, fwid->name, bus_token); default: domain->name = fwid->name; - if (bus_token) { - domain->name = kasprintf(GFP_KERNEL, "%s-%d", - fwid->name, bus_token); - if (!domain->name) - return -ENOMEM; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } - break; + if (bus_token != DOMAIN_BUS_ANY) + return alloc_name(domain, fwid->name, bus_token); } - } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || - is_software_node(fwnode)) { - char *name; - - /* - * fwnode paths contain '/', which debugfs is legitimately - * unhappy about. Replace them with ':', which does - * the trick and is not as offensive as '\'... - */ - name = bus_token ? - kasprintf(GFP_KERNEL, "%pfw-%d", fwnode, bus_token) : - kasprintf(GFP_KERNEL, "%pfw", fwnode); - if (!name) - return -ENOMEM; - domain->name = strreplace(name, '/', ':'); - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; + } else if (is_of_node(fwnode) || is_acpi_device_node(fwnode) || is_software_node(fwnode)) { + return alloc_fwnode_name(domain, fwnode, bus_token, info->name_suffix); } - if (!domain->name) { - if (fwnode) - pr_err("Invalid fwnode type for irqdomain\n"); - domain->name = bus_token ? - kasprintf(GFP_KERNEL, "unknown-%d-%d", - atomic_inc_return(&unknown_domains), - bus_token) : - kasprintf(GFP_KERNEL, "unknown-%d", - atomic_inc_return(&unknown_domains)); - if (!domain->name) - return -ENOMEM; - domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED; - } + if (domain->name) + return 0; - return 0; + if (fwnode) + pr_err("Invalid fwnode type for irqdomain\n"); + return alloc_unknown_name(domain, bus_token); } static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info) @@ -211,7 +237,7 @@ static struct irq_domain *__irq_domain_create(const struct irq_domain_info *info if (!domain) return ERR_PTR(-ENOMEM); - err = irq_domain_set_name(domain, info->fwnode, info->bus_token); + err = irq_domain_set_name(domain, info); if (err) { kfree(domain); return ERR_PTR(err); @@ -267,13 +293,20 @@ static void irq_domain_free(struct irq_domain *domain) kfree(domain); } -/** - * irq_domain_instantiate() - Instantiate a new irq domain data structure - * @info: Domain information pointer pointing to the information for this domain - * - * Return: A pointer to the instantiated irq domain or an ERR_PTR value. - */ -struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) +static void irq_domain_instantiate_descs(const struct irq_domain_info *info) +{ + if (!IS_ENABLED(CONFIG_SPARSE_IRQ)) + return; + + if (irq_alloc_descs(info->virq_base, info->virq_base, info->size, + of_node_to_nid(to_of_node(info->fwnode))) < 0) { + pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", + info->virq_base); + } +} + +static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info *info, + bool cond_alloc_descs, bool force_associate) { struct irq_domain *domain; int err; @@ -306,6 +339,19 @@ struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) __irq_domain_publish(domain); + if (cond_alloc_descs && info->virq_base > 0) + irq_domain_instantiate_descs(info); + + /* + * Legacy interrupt domains have a fixed Linux interrupt number + * associated. Other interrupt domains can request association by + * providing a Linux interrupt number > 0. + */ + if (force_associate || info->virq_base > 0) { + irq_domain_associate_many(domain, info->virq_base, info->hwirq_base, + info->size - info->hwirq_base); + } + return domain; err_domain_gc_remove: @@ -315,6 +361,17 @@ err_domain_free: irq_domain_free(domain); return ERR_PTR(err); } + +/** + * irq_domain_instantiate() - Instantiate a new irq domain data structure + * @info: Domain information pointer pointing to the information for this domain + * + * Return: A pointer to the instantiated irq domain or an ERR_PTR value. + */ +struct irq_domain *irq_domain_instantiate(const struct irq_domain_info *info) +{ + return __irq_domain_instantiate(info, false, false); +} EXPORT_SYMBOL_GPL(irq_domain_instantiate); /** @@ -413,28 +470,13 @@ struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, .fwnode = fwnode, .size = size, .hwirq_max = size, + .virq_base = first_irq, .ops = ops, .host_data = host_data, }; - struct irq_domain *domain; - - domain = irq_domain_instantiate(&info); - if (IS_ERR(domain)) - return NULL; + struct irq_domain *domain = __irq_domain_instantiate(&info, true, false); - if (first_irq > 0) { - if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { - /* attempt to allocated irq_descs */ - int rc = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(to_of_node(fwnode))); - if (rc < 0) - pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", - first_irq); - } - irq_domain_associate_many(domain, first_irq, 0, size); - } - - return domain; + return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_simple); @@ -476,18 +518,14 @@ struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, .fwnode = fwnode, .size = first_hwirq + size, .hwirq_max = first_hwirq + size, + .hwirq_base = first_hwirq, + .virq_base = first_irq, .ops = ops, .host_data = host_data, }; - struct irq_domain *domain; + struct irq_domain *domain = __irq_domain_instantiate(&info, false, true); - domain = irq_domain_instantiate(&info); - if (IS_ERR(domain)) - return NULL; - - irq_domain_associate_many(domain, first_irq, first_hwirq, size); - - return domain; + return IS_ERR(domain) ? NULL : domain; } EXPORT_SYMBOL_GPL(irq_domain_create_legacy); @@ -1365,7 +1403,7 @@ static int irq_domain_trim_hierarchy(unsigned int virq) tail = NULL; /* The first entry must have a valid irqchip */ - if (!irq_data->chip || IS_ERR(irq_data->chip)) + if (IS_ERR_OR_NULL(irq_data->chip)) return -EINVAL; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dd53298ef1a5..f0803d6bd296 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -218,21 +218,20 @@ static void irq_validate_effective_affinity(struct irq_data *data) static inline void irq_validate_effective_affinity(struct irq_data *data) { } #endif +static DEFINE_PER_CPU(struct cpumask, __tmp_mask); + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { + struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask); struct irq_desc *desc = irq_data_to_desc(data); struct irq_chip *chip = irq_data_get_irq_chip(data); const struct cpumask *prog_mask; int ret; - static DEFINE_RAW_SPINLOCK(tmp_mask_lock); - static struct cpumask tmp_mask; - if (!chip || !chip->irq_set_affinity) return -EINVAL; - raw_spin_lock(&tmp_mask_lock); /* * If this is a managed interrupt and housekeeping is enabled on * it check whether the requested affinity mask intersects with @@ -258,11 +257,11 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); - cpumask_and(&tmp_mask, mask, hk_mask); - if (!cpumask_intersects(&tmp_mask, cpu_online_mask)) + cpumask_and(tmp_mask, mask, hk_mask); + if (!cpumask_intersects(tmp_mask, cpu_online_mask)) prog_mask = mask; else - prog_mask = &tmp_mask; + prog_mask = tmp_mask; } else { prog_mask = mask; } @@ -272,16 +271,14 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, * unless we are being asked to force the affinity (in which * case we do as we are told). */ - cpumask_and(&tmp_mask, prog_mask, cpu_online_mask); - if (!force && !cpumask_empty(&tmp_mask)) - ret = chip->irq_set_affinity(data, &tmp_mask, force); + cpumask_and(tmp_mask, prog_mask, cpu_online_mask); + if (!force && !cpumask_empty(tmp_mask)) + ret = chip->irq_set_affinity(data, tmp_mask, force); else if (force) ret = chip->irq_set_affinity(data, mask, force); else ret = -EINVAL; - raw_spin_unlock(&tmp_mask_lock); - switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 61ca924ef4b4..eb150afd671f 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -26,7 +26,7 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) * The outgoing CPU might be the last online target in a pending * interrupt move. If that's the case clear the pending move bit. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) >= nr_cpu_ids) { + if (!cpumask_intersects(desc->pending_mask, cpu_online_mask)) { irqd_clr_move_pending(data); return false; } @@ -74,7 +74,7 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) { + if (cpumask_intersects(desc->pending_mask, cpu_online_mask)) { int ret; ret = irq_do_set_affinity(data, desc->pending_mask, false); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 5fa0547ece0c..1c7e5159064c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -82,7 +82,7 @@ static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, desc->dev = dev; desc->nvec_used = nvec; if (affinity) { - desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL); + desc->affinity = kmemdup_array(affinity, nvec, sizeof(*desc->affinity), GFP_KERNEL); if (!desc->affinity) { kfree(desc); return NULL; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8cccdf40725a..9081ada81c3d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -52,10 +52,8 @@ static int show_irq_affinity(int type, struct seq_file *m) case AFFINITY: case AFFINITY_LIST: mask = desc->irq_common_data.affinity; -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (irqd_is_setaffinity_pending(&desc->irq_data)) - mask = desc->pending_mask; -#endif + if (irq_move_pending(&desc->irq_data)) + mask = irq_desc_get_pending_mask(desc); break; case EFFECTIVE: case EFFECTIVE_LIST: @@ -142,7 +140,7 @@ static ssize_t write_irq_affinity(int type, struct file *file, int err; if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) - return -EIO; + return -EPERM; if (!zalloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; @@ -362,8 +360,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) goto out_unlock; #ifdef CONFIG_SMP + umode_t umode = S_IRUGO; + + if (irq_can_set_affinity_usr(desc->irq_data.irq)) + umode |= S_IWUSR; + /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", 0644, desc->dir, + proc_create_data("smp_affinity", umode, desc->dir, &irq_affinity_proc_ops, irqp); /* create /proc/irq/<irq>/affinity_hint */ @@ -371,7 +374,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ - proc_create_data("smp_affinity_list", 0644, desc->dir, + proc_create_data("smp_affinity_list", umode, desc->dir, &irq_affinity_list_proc_ops, irqp); proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, diff --git a/kernel/kcov.c b/kernel/kcov.c index 274b6b7c718d..28a6be6e64fd 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/hashtable.h> #include <linux/init.h> +#include <linux/jiffies.h> #include <linux/kmsan-checks.h> #include <linux/mm.h> #include <linux/preempt.h> @@ -1067,6 +1068,32 @@ u64 kcov_common_handle(void) } EXPORT_SYMBOL(kcov_common_handle); +#ifdef CONFIG_KCOV_SELFTEST +static void __init selftest(void) +{ + unsigned long start; + + pr_err("running self test\n"); + /* + * Test that interrupts don't produce spurious coverage. + * The coverage callback filters out interrupt code, but only + * after the handler updates preempt count. Some code periodically + * leaks out of that section and leads to spurious coverage. + * It's hard to call the actual interrupt handler directly, + * so we just loop here for a bit waiting for a timer interrupt. + * We set kcov_mode to enable tracing, but don't setup the area, + * so any attempt to trace will crash. Note: we must not call any + * potentially traced functions in this region. + */ + start = jiffies; + current->kcov_mode = KCOV_MODE_TRACE_PC; + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300) + ; + current->kcov_mode = 0; + pr_err("done running self test\n"); +} +#endif + static int __init kcov_init(void) { int cpu; @@ -1086,6 +1113,10 @@ static int __init kcov_init(void) */ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); +#ifdef CONFIG_KCOV_SELFTEST + selftest(); +#endif + return 0; } diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index 1d1d1b0e4248..53b21ae30e00 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -225,7 +225,7 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o { char kbuf[KSYM_NAME_LEN]; char *arg; - int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1); + const size_t read_len = min(count, sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) return -EFAULT; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 3d64290d24c9..3eedb8c226ad 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -752,7 +752,7 @@ static int kexec_calculate_store_digests(struct kimage *image) #ifdef CONFIG_CRASH_HOTPLUG /* Exclude elfcorehdr segment to allow future changes via hotplug */ - if (j == image->elfcorehdr_index) + if (i == image->elfcorehdr_index) continue; #endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 0349f957e672..7963deac33c3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -56,6 +56,7 @@ #include <linux/kprobes.h> #include <linux/lockdep.h> #include <linux/context_tracking.h> +#include <linux/console.h> #include <asm/sections.h> @@ -573,8 +574,10 @@ static struct lock_trace *save_trace(void) if (!debug_locks_off_graph_unlock()) return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } @@ -887,11 +890,13 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { instrumentation_begin(); debug_locks_off(); + nbcon_cpu_emergency_enter(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); instrumentation_end(); return NULL; } @@ -968,11 +973,13 @@ static bool assign_lock_key(struct lockdep_map *lock) else { /* Debug-check: all keys must be persistent! */ debug_locks_off(); + nbcon_cpu_emergency_enter(); pr_err("INFO: trying to register non-static key.\n"); pr_err("The code is fine but needs lockdep annotation, or maybe\n"); pr_err("you didn't initialize this object before use?\n"); pr_err("turning off the locking correctness validator.\n"); dump_stack(); + nbcon_cpu_emergency_exit(); return false; } @@ -1316,8 +1323,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_lock_classes++; @@ -1349,11 +1358,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); + nbcon_cpu_emergency_enter(); printk("\nnew class %px: %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); if (!graph_lock()) { return NULL; @@ -1392,8 +1403,10 @@ static struct lock_list *alloc_list_entry(void) if (!debug_locks_off_graph_unlock()) return NULL; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return NULL; } nr_list_entries++; @@ -2039,6 +2052,8 @@ static noinline void print_circular_bug(struct lock_list *this, depth = get_lock_depth(target); + nbcon_cpu_emergency_enter(); + print_circular_bug_header(target, depth, check_src, check_tgt); parent = get_lock_parent(target); @@ -2057,6 +2072,8 @@ static noinline void print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static noinline void print_bfs_bug(int ret) @@ -2569,6 +2586,8 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================================\n"); pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", @@ -2618,11 +2637,13 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn(" and %s-irq-unsafe lock:\n", irqclass); next_root->trace = save_trace(); if (!next_root->trace) - return; + goto out; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); } static const char *state_names[] = { @@ -2987,6 +3008,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================================\n"); pr_warn("WARNING: possible recursive locking detected\n"); @@ -3009,6 +3032,8 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } /* @@ -3606,6 +3631,8 @@ static void print_collision(struct task_struct *curr, struct held_lock *hlock_next, struct lock_chain *chain) { + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("============================\n"); pr_warn("WARNING: chain_key collision\n"); @@ -3622,6 +3649,8 @@ static void print_collision(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } #endif @@ -3712,8 +3741,10 @@ static inline int add_chain_cache(struct task_struct *curr, if (!debug_locks_off_graph_unlock()) return 0; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } chain->chain_key = chain_key; @@ -3730,8 +3761,10 @@ static inline int add_chain_cache(struct task_struct *curr, if (!debug_locks_off_graph_unlock()) return 0; + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!"); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } @@ -3970,6 +4003,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("================================\n"); pr_warn("WARNING: inconsistent lock state\n"); @@ -3998,6 +4033,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } /* @@ -4032,6 +4069,8 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("========================================================\n"); pr_warn("WARNING: possible irq lock inversion dependency detected\n"); @@ -4072,11 +4111,13 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); root->trace = save_trace(); if (!root->trace) - return; + goto out; print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); +out: + nbcon_cpu_emergency_exit(); } /* @@ -4153,6 +4194,8 @@ void print_irqtrace_events(struct task_struct *curr) { const struct irqtrace_events *trace = &curr->irqtrace; + nbcon_cpu_emergency_enter(); + printk("irq event stamp: %u\n", trace->irq_events); printk("hardirqs last enabled at (%u): [<%px>] %pS\n", trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip, @@ -4166,6 +4209,8 @@ void print_irqtrace_events(struct task_struct *curr) printk("softirqs last disabled at (%u): [<%px>] %pS\n", trace->softirq_disable_event, (void *)trace->softirq_disable_ip, (void *)trace->softirq_disable_ip); + + nbcon_cpu_emergency_exit(); } static int HARDIRQ_verbose(struct lock_class *class) @@ -4686,10 +4731,12 @@ unlock: * We must printk outside of the graph_lock: */ if (ret == 2) { + nbcon_cpu_emergency_enter(); printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); print_lock(this); print_irqtrace_events(curr); dump_stack(); + nbcon_cpu_emergency_exit(); } return ret; @@ -4730,6 +4777,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, if (debug_locks_silent) return 0; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=============================\n"); pr_warn("[ BUG: Invalid wait context ]\n"); @@ -4749,6 +4798,8 @@ print_lock_invalid_wait_context(struct task_struct *curr, pr_warn("stack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); + return 0; } @@ -4956,6 +5007,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("==================================\n"); pr_warn("WARNING: Nested lock was not taken\n"); @@ -4976,6 +5029,8 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static int __lock_is_held(const struct lockdep_map *lock, int read); @@ -5024,11 +5079,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, debug_class_ops_inc(class); if (very_verbose(class)) { + nbcon_cpu_emergency_enter(); printk("\nacquire class [%px] %s", class->key, class->name); if (class->name_version > 1) printk(KERN_CONT "#%d", class->name_version); printk(KERN_CONT "\n"); dump_stack(); + nbcon_cpu_emergency_exit(); } /* @@ -5155,6 +5212,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); + nbcon_cpu_emergency_enter(); print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); @@ -5162,6 +5220,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, lockdep_print_held_locks(current); debug_show_all_locks(); dump_stack(); + nbcon_cpu_emergency_exit(); return 0; } @@ -5181,6 +5240,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=====================================\n"); pr_warn("WARNING: bad unlock balance detected!\n"); @@ -5197,6 +5258,8 @@ static void print_unlock_imbalance_bug(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static noinstr int match_held_lock(const struct held_lock *hlock, @@ -5901,6 +5964,8 @@ static void print_lock_contention_bug(struct task_struct *curr, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=================================\n"); pr_warn("WARNING: bad contention detected!\n"); @@ -5917,6 +5982,8 @@ static void print_lock_contention_bug(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static void @@ -6536,6 +6603,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("=========================\n"); pr_warn("WARNING: held lock freed!\n"); @@ -6548,6 +6617,8 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } static inline int not_in_range(const void* mem_from, unsigned long mem_len, @@ -6594,6 +6665,8 @@ static void print_held_locks_bug(void) if (debug_locks_silent) return; + nbcon_cpu_emergency_enter(); + pr_warn("\n"); pr_warn("====================================\n"); pr_warn("WARNING: %s/%d still has locks held!\n", @@ -6603,6 +6676,8 @@ static void print_held_locks_bug(void) lockdep_print_held_locks(current); pr_warn("\nstack backtrace:\n"); dump_stack(); + + nbcon_cpu_emergency_exit(); } void debug_check_no_locks_held(void) @@ -6660,6 +6735,7 @@ asmlinkage __visible void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("================================================\n"); pr_warn("WARNING: lock held when returning to user space!\n"); @@ -6668,6 +6744,7 @@ asmlinkage __visible void lockdep_sys_exit(void) pr_warn("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); + nbcon_cpu_emergency_exit(); } /* @@ -6684,6 +6761,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) bool rcu = warn_rcu_enter(); /* Note: the following can be executed concurrently, so be careful. */ + nbcon_cpu_emergency_enter(); pr_warn("\n"); pr_warn("=============================\n"); pr_warn("WARNING: suspicious RCU usage\n"); @@ -6722,6 +6800,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + nbcon_cpu_emergency_exit(); warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 88d08eeb8bc0..ebebd0eec7f6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task) { int prio = task->prio; - if (!rt_prio(prio)) + if (!rt_or_dl_prio(prio)) return DEFAULT_PRIO; return prio; @@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, * Note that RT tasks are excluded from same priority (lateral) * steals to prevent the introduction of an unbounded latency. */ - if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) + if (rt_or_dl_prio(waiter->tree.prio)) return false; return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); @@ -1644,6 +1644,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, } static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, + struct rt_mutex_base *lock, struct rt_mutex_waiter *w) { /* @@ -1656,10 +1657,10 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, if (build_ww_mutex() && w->ww_ctx) return; - /* - * Yell loudly and stop the task right here. - */ + raw_spin_unlock_irq(&lock->wait_lock); + WARN(1, "rtmutex deadlock detected\n"); + while (1) { set_current_state(TASK_INTERRUPTIBLE); rt_mutex_schedule(); @@ -1713,7 +1714,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, } else { __set_current_state(TASK_RUNNING); remove_waiter(lock, waiter); - rt_mutex_handle_deadlock(ret, chwalk, waiter); + rt_mutex_handle_deadlock(ret, chwalk, lock, waiter); } /* diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 33cac79e3994..5ded7dff46ef 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * if it is an RT task or wait in the wait queue * for too long. */ - if (has_handoff || (!rt_task(waiter->task) && + if (has_handoff || (!rt_or_dl_task(waiter->task) && !time_after(jiffies, waiter->timeout))) return false; @@ -914,7 +914,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) if (owner_state != OWNER_WRITER) { if (need_resched()) break; - if (rt_task(current) && + if (rt_or_dl_task(current) && (prev_owner_state != OWNER_WRITER)) break; } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 3ad2cc4823e5..76d204b7d29c 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) int a_prio = a->task->prio; int b_prio = b->task->prio; - if (rt_prio(a_prio) || rt_prio(b_prio)) { + if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { if (a_prio > b_prio) return true; diff --git a/kernel/module/Makefile b/kernel/module/Makefile index a10b2b9a6fdf..50ffcc413b54 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -5,7 +5,7 @@ # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. -KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_main.o := n obj-y += main.o obj-y += strict_rwx.o diff --git a/kernel/numa.c b/kernel/numa.c deleted file mode 100644 index 67ca6b8585c0..000000000000 --- a/kernel/numa.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <linux/printk.h> -#include <linux/numa.h> - -/* Stub functions: */ - -#ifndef memory_add_physaddr_to_nid -int memory_add_physaddr_to_nid(u64 start) -{ - pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - -#ifndef phys_to_target_node -int phys_to_target_node(u64 start) -{ - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} -EXPORT_SYMBOL_GPL(phys_to_target_node); -#endif diff --git a/kernel/padata.c b/kernel/padata.c index 0fa6c2895460..d899f34558af 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -404,7 +404,8 @@ void padata_do_serial(struct padata_priv *padata) /* Sort in ascending order of sequence number. */ list_for_each_prev(pos, &reorder->list) { cur = list_entry(pos, struct padata_priv, list); - if (cur->seq_nr < padata->seq_nr) + /* Compare by difference to consider integer wrap around */ + if ((signed int)(cur->seq_nr - padata->seq_nr) < 0) break; } list_add(&padata->list, pos); @@ -512,9 +513,12 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) * thread function. Load balance large jobs between threads by * increasing the number of chunks, guarantee at least the minimum * chunk size from the caller, and honor the caller's alignment. + * Ensure chunk_size is at least 1 to prevent divide-by-0 + * panic in padata_mt_helper(). */ ps.chunk_size = job->size / (ps.nworks * load_balance_factor); ps.chunk_size = max(ps.chunk_size, job->min_chunk); + ps.chunk_size = max(ps.chunk_size, 1ul); ps.chunk_size = roundup(ps.chunk_size, job->align); /* diff --git a/kernel/panic.c b/kernel/panic.c index 2a0449144f82..fbc59b3b64d0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -374,6 +374,8 @@ void panic(const char *fmt, ...) panic_other_cpus_shutdown(_crash_kexec_post_notifiers); + printk_legacy_allow_panic_sync(); + /* * Run any panic handlers, including those that might need to * add information to the kmsg dump output. @@ -382,7 +384,7 @@ void panic(const char *fmt, ...) panic_print_sys_info(false); - kmsg_dump(KMSG_DUMP_PANIC); + kmsg_dump_desc(KMSG_DUMP_PANIC, buf); /* * If you doubt kdump always works fine in any situation, @@ -463,6 +465,7 @@ void panic(const char *fmt, ...) * Explicitly flush the kernel log buffer one last time. */ console_flush_on_panic(CONSOLE_FLUSH_PENDING); + nbcon_atomic_flush_unsafe(); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { @@ -682,6 +685,7 @@ bool oops_may_print(void) */ void oops_enter(void) { + nbcon_cpu_emergency_enter(); tracing_off(); /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); @@ -704,6 +708,7 @@ void oops_exit(void) { do_oops_enter_exit(); print_oops_end_marker(); + nbcon_cpu_emergency_exit(); kmsg_dump(KMSG_DUMP_OOPS); } @@ -715,6 +720,8 @@ struct warn_args { void __warn(const char *file, int line, void *caller, unsigned taint, struct pt_regs *regs, struct warn_args *args) { + nbcon_cpu_emergency_enter(); + disable_trace_on_warning(); if (file) @@ -750,6 +757,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); + + nbcon_cpu_emergency_exit(); } #ifdef CONFIG_BUG diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 0a213f69a9e4..e35829d36039 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1123,11 +1123,11 @@ static const char * const hibernation_modes[] = { static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + ssize_t count = 0; int i; - char *start = buf; if (!hibernation_available()) - return sprintf(buf, "[disabled]\n"); + return sysfs_emit(buf, "[disabled]\n"); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (!hibernation_modes[i]) @@ -1147,12 +1147,16 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, continue; } if (i == hibernation_mode) - buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "[%s] ", hibernation_modes[i]); else - buf += sprintf(buf, "%s ", hibernation_modes[i]); + count += sysfs_emit_at(buf, count, "%s ", hibernation_modes[i]); } - buf += sprintf(buf, "\n"); - return buf-start; + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1210,8 +1214,8 @@ power_attr(disk); static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d:%d\n", MAJOR(swsusp_resume_device), - MINOR(swsusp_resume_device)); + return sysfs_emit(buf, "%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); } static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1270,7 +1274,7 @@ power_attr(resume); static ssize_t resume_offset_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)swsusp_resume_block); + return sysfs_emit(buf, "%llu\n", (unsigned long long)swsusp_resume_block); } static ssize_t resume_offset_store(struct kobject *kobj, @@ -1293,7 +1297,7 @@ power_attr(resume_offset); static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", image_size); + return sysfs_emit(buf, "%lu\n", image_size); } static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1314,7 +1318,7 @@ power_attr(image_size); static ssize_t reserved_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", reserved_size); + return sysfs_emit(buf, "%lu\n", reserved_size); } static ssize_t reserved_size_store(struct kobject *kobj, diff --git a/kernel/power/main.c b/kernel/power/main.c index a9e0693aaf69..6254814d4817 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -115,7 +115,7 @@ int pm_async_enabled = 1; static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_async_enabled); + return sysfs_emit(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -139,7 +139,7 @@ power_attr(pm_async); static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { @@ -149,17 +149,17 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) - s += sprintf(s, "[%s] ", label); + count += sysfs_emit_at(buf, count, "[%s] ", label); else - s += sprintf(s, "%s ", label); + count += sysfs_emit_at(buf, count, "%s ", label); } } /* Convert the last space to a newline if needed. */ - if (s != buf) - *(s-1) = '\n'; + if (count > 0) + buf[count - 1] = '\n'; - return (s - buf); + return count; } static suspend_state_t decode_suspend_state(const char *buf, size_t n) @@ -220,7 +220,7 @@ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", sync_on_suspend_enabled); + return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, @@ -257,22 +257,22 @@ static const char * const pm_tests[__TEST_AFTER_LAST] = { static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) - s += sprintf(s, "[%s] ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]); else - s += sprintf(s, "%s ", pm_tests[level]); + count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]); } - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; - return (s - buf); + return count; } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -390,7 +390,7 @@ static const char * const suspend_step_names[] = { static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ - return sprintf(buf, format_str, suspend_stats._name); \ + return sysfs_emit(buf, format_str, suspend_stats._name);\ } \ static struct kobj_attribute _name = __ATTR_RO(_name) @@ -404,7 +404,7 @@ suspend_attr(max_hw_sleep, "%llu\n"); static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ - return sprintf(buf, "%u\n", \ + return sysfs_emit(buf, "%u\n", \ suspend_stats.step_failures[step-1]); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) @@ -428,7 +428,7 @@ static ssize_t last_failed_dev_show(struct kobject *kobj, index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; - return sprintf(buf, "%s\n", last_failed_dev); + return sysfs_emit(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); @@ -442,7 +442,7 @@ static ssize_t last_failed_errno_show(struct kobject *kobj, index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; - return sprintf(buf, "%d\n", last_failed_errno); + return sysfs_emit(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); @@ -456,7 +456,7 @@ static ssize_t last_failed_step_show(struct kobject *kobj, index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; - return sprintf(buf, "%s\n", suspend_step_names[step]); + return sysfs_emit(buf, "%s\n", suspend_step_names[step]); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); @@ -571,7 +571,7 @@ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_print_times_enabled); + return sysfs_emit(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, @@ -604,7 +604,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, if (!pm_wakeup_irq()) return -ENODATA; - return sprintf(buf, "%u\n", pm_wakeup_irq()); + return sysfs_emit(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); @@ -620,7 +620,7 @@ EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_debug_messages_on); + return sysfs_emit(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, @@ -668,21 +668,23 @@ struct kobject *power_kobj; static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - char *s = buf; + ssize_t count = 0; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) - s += sprintf(s,"%s ", pm_states[i]); + count += sysfs_emit_at(buf, count, "%s ", pm_states[i]); #endif if (hibernation_available()) - s += sprintf(s, "disk "); - if (s != buf) - /* convert the last space to a newline */ - *(s-1) = '\n'; - return (s - buf); + count += sysfs_emit_at(buf, count, "disk "); + + /* Convert the last space to a newline if needed. */ + if (count > 0) + buf[count - 1] = '\n'; + + return count; } static suspend_state_t decode_state(const char *buf, size_t n) @@ -782,7 +784,7 @@ static ssize_t wakeup_count_show(struct kobject *kobj, unsigned int val; return pm_get_wakeup_count(&val, true) ? - sprintf(buf, "%u\n", val) : -EINTR; + sysfs_emit(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, @@ -824,17 +826,17 @@ static ssize_t autosleep_show(struct kobject *kobj, suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) - return sprintf(buf, "off\n"); + return sysfs_emit(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) - return sprintf(buf, "%s\n", pm_states[state] ? + return sysfs_emit(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION - return sprintf(buf, "disk\n"); + return sysfs_emit(buf, "disk\n"); #else - return sprintf(buf, "error"); + return sysfs_emit(buf, "error\n"); #endif } @@ -903,7 +905,7 @@ int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", pm_trace_enabled); + return sysfs_emit(buf, "%d\n", pm_trace_enabled); } static ssize_t @@ -940,7 +942,7 @@ power_attr_ro(pm_trace_dev_match); static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", freeze_timeout_msecs); + return sysfs_emit(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 405eddbda4fc..30894d8f0a78 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1365,11 +1365,6 @@ static unsigned int count_highmem_pages(void) } return n; } -#else -static inline void *saveable_highmem_page(struct zone *z, unsigned long p) -{ - return NULL; -} #endif /* CONFIG_HIGHMEM */ /** diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 19dcc5832651..3fcb48502adb 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -2,11 +2,12 @@ /* * internal.h - printk internal definitions */ -#include <linux/percpu.h> #include <linux/console.h> -#include "printk_ringbuffer.h" +#include <linux/percpu.h> +#include <linux/types.h> #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +struct ctl_table; void __init printk_sysctl_init(void); int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); @@ -20,6 +21,19 @@ int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, (con->flags & CON_BOOT) ? "boot" : "", \ con->name, con->index, ##__VA_ARGS__) +/* + * Identify if legacy printing is forced in a dedicated kthread. If + * true, all printing via console lock occurs within a dedicated + * legacy printer thread. The only exception is on panic, after the + * nbcon consoles have had their chance to print the panic messages + * first. + */ +#ifdef CONFIG_PREEMPT_RT +# define force_legacy_kthread() (true) +#else +# define force_legacy_kthread() (false) +#endif + #ifdef CONFIG_PRINTK #ifdef CONFIG_PRINTK_CALLER @@ -43,7 +57,11 @@ enum printk_info_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; +struct printk_ringbuffer; +struct dev_printk_info; + extern struct printk_ringbuffer *prb; +extern bool printk_kthreads_running; __printf(4, 0) int vprintk_store(int facility, int level, @@ -53,6 +71,9 @@ int vprintk_store(int facility, int level, __printf(1, 0) int vprintk_default(const char *fmt, va_list args); __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); +void __printk_safe_enter(void); +void __printk_safe_exit(void); + bool printk_percpu_data_ready(void); #define printk_safe_enter_irqsave(flags) \ @@ -68,15 +89,85 @@ bool printk_percpu_data_ready(void); } while (0) void defer_console_output(void); +bool is_printk_legacy_deferred(void); u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags); +void console_lock_spinning_enable(void); +int console_lock_spinning_disable_and_check(int cookie); u64 nbcon_seq_read(struct console *con); void nbcon_seq_force(struct console *con, u64 seq); bool nbcon_alloc(struct console *con); -void nbcon_init(struct console *con); void nbcon_free(struct console *con); +enum nbcon_prio nbcon_get_default_prio(void); +void nbcon_atomic_flush_pending(void); +bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic); +bool nbcon_kthread_create(struct console *con); +void nbcon_kthread_stop(struct console *con); +void nbcon_kthreads_wake(void); + +/* + * Check if the given console is currently capable and allowed to print + * records. Note that this function does not consider the current context, + * which can also play a role in deciding if @con can be used to print + * records. + */ +static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) +{ + if (!(flags & CON_ENABLED)) + return false; + + if ((flags & CON_SUSPENDED)) + return false; + + if (flags & CON_NBCON) { + /* The write_atomic() callback is optional. */ + if (use_atomic && !con->write_atomic) + return false; + + /* + * For the !use_atomic case, @printk_kthreads_running is not + * checked because the write_thread() callback is also used + * via the legacy loop when the printer threads are not + * available. + */ + } else { + if (!con->write) + return false; + } + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. + */ + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) + return false; + + return true; +} + +/** + * nbcon_kthread_wake - Wake up a console printing thread + * @con: Console to operate on + */ +static inline void nbcon_kthread_wake(struct console *con) +{ + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the rcuwait is empty. + * + * The full memory barrier in rcuwait_wake_up() pairs with the full + * memory barrier within set_current_state() of + * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait() + * adds the waiter but before it has checked the wait condition. + * + * This pairs with nbcon_kthread_func:A. + */ + rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */ +} #else @@ -84,6 +175,8 @@ void nbcon_free(struct console *con); #define PRINTK_MESSAGE_MAX 0 #define PRINTKRB_RECORD_MAX 0 +#define printk_kthreads_running (false) + /* * In !PRINTK builds we still export console_sem * semaphore and some of console functions (console_unlock()/etc.), so @@ -93,14 +186,119 @@ void nbcon_free(struct console *con); #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) static inline bool printk_percpu_data_ready(void) { return false; } +static inline void defer_console_output(void) { } +static inline bool is_printk_legacy_deferred(void) { return false; } static inline u64 nbcon_seq_read(struct console *con) { return 0; } static inline void nbcon_seq_force(struct console *con, u64 seq) { } static inline bool nbcon_alloc(struct console *con) { return false; } -static inline void nbcon_init(struct console *con) { } static inline void nbcon_free(struct console *con) { } +static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; } +static inline void nbcon_atomic_flush_pending(void) { } +static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic) { return false; } +static inline void nbcon_kthread_wake(struct console *con) { } +static inline void nbcon_kthreads_wake(void) { } + +static inline bool console_is_usable(struct console *con, short flags, + bool use_atomic) { return false; } #endif /* CONFIG_PRINTK */ +extern bool have_boot_console; +extern bool have_nbcon_console; +extern bool have_legacy_console; +extern bool legacy_allow_panic_sync; + +/** + * struct console_flush_type - Define available console flush methods + * @nbcon_atomic: Flush directly using nbcon_atomic() callback + * @nbcon_offload: Offload flush to printer thread + * @legacy_direct: Call the legacy loop in this context + * @legacy_offload: Offload the legacy loop into IRQ or legacy thread + * + * Note that the legacy loop also flushes the nbcon consoles. + */ +struct console_flush_type { + bool nbcon_atomic; + bool nbcon_offload; + bool legacy_direct; + bool legacy_offload; +}; + +/* + * Identify which console flushing methods should be used in the context of + * the caller. + */ +static inline void printk_get_console_flush_type(struct console_flush_type *ft) +{ + memset(ft, 0, sizeof(*ft)); + + switch (nbcon_get_default_prio()) { + case NBCON_PRIO_NORMAL: + if (have_nbcon_console && !have_boot_console) { + if (printk_kthreads_running) + ft->nbcon_offload = true; + else + ft->nbcon_atomic = true; + } + + /* Legacy consoles are flushed directly when possible. */ + if (have_legacy_console || have_boot_console) { + if (!is_printk_legacy_deferred()) + ft->legacy_direct = true; + else + ft->legacy_offload = true; + } + break; + + case NBCON_PRIO_EMERGENCY: + if (have_nbcon_console && !have_boot_console) + ft->nbcon_atomic = true; + + /* Legacy consoles are flushed directly when possible. */ + if (have_legacy_console || have_boot_console) { + if (!is_printk_legacy_deferred()) + ft->legacy_direct = true; + else + ft->legacy_offload = true; + } + break; + + case NBCON_PRIO_PANIC: + /* + * In panic, the nbcon consoles will directly print. But + * only allowed if there are no boot consoles. + */ + if (have_nbcon_console && !have_boot_console) + ft->nbcon_atomic = true; + + if (have_legacy_console || have_boot_console) { + /* + * This is the same decision as NBCON_PRIO_NORMAL + * except that offloading never occurs in panic. + * + * Note that console_flush_on_panic() will flush + * legacy consoles anyway, even if unsafe. + */ + if (!is_printk_legacy_deferred()) + ft->legacy_direct = true; + + /* + * In panic, if nbcon atomic printing occurs, + * the legacy consoles must remain silent until + * explicitly allowed. + */ + if (ft->nbcon_atomic && !legacy_allow_panic_sync) + ft->legacy_direct = false; + } + break; + + default: + WARN_ON_ONCE(1); + break; + } +} + extern struct printk_buffers printk_shared_pbufs; /** @@ -135,4 +333,5 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, #ifdef CONFIG_PRINTK void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); +void console_prepend_replay(struct printk_message *pmsg); #endif diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index c8093bcc01fe..fd12efcc4aed 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -2,11 +2,25 @@ // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner -#include <linux/kernel.h> +#include <linux/atomic.h> +#include <linux/bug.h> #include <linux/console.h> #include <linux/delay.h> +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/kthread.h> +#include <linux/minmax.h> +#include <linux/percpu.h> +#include <linux/preempt.h> #include <linux/slab.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> #include "internal.h" +#include "printk_ringbuffer.h" /* * Printk console printing implementation for consoles which does not depend * on the legacy style console_lock mechanism. @@ -172,9 +186,6 @@ void nbcon_seq_force(struct console *con, u64 seq) u64 valid_seq = max_t(u64, seq, prb_first_valid_seq(prb)); atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq)); - - /* Clear con->seq since nbcon consoles use con->nbcon_seq instead. */ - con->seq = 0; } /** @@ -231,6 +242,13 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, struct nbcon_state new; do { + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * acquire ownership in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ if (other_cpu_in_panic()) return -EPERM; @@ -262,18 +280,29 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) /* * The request context is well defined by the @req_prio because: * - * - Only a context with a higher priority can take over the request. + * - Only a context with a priority higher than the owner can become + * a waiter. + * - Only a context with a priority higher than the waiter can + * directly take over the request. * - There are only three priorities. * - Only one CPU is allowed to request PANIC priority. * - Lower priorities are ignored during panic() until reboot. * * As a result, the following scenario is *not* possible: * - * 1. Another context with a higher priority directly takes ownership. - * 2. The higher priority context releases the ownership. - * 3. A lower priority context takes the ownership. - * 4. Another context with the same priority as this context + * 1. This context is currently a waiter. + * 2. Another context with a higher priority than this context + * directly takes ownership. + * 3. The higher priority context releases the ownership. + * 4. Another lower priority context takes the ownership. + * 5. Another context with the same priority as this context * creates a request and starts waiting. + * + * Event #1 implies this context is EMERGENCY. + * Event #2 implies the new context is PANIC. + * Event #3 occurs when panic() has flushed the console. + * Events #4 and #5 are not possible due to the other_cpu_in_panic() + * check in nbcon_context_try_acquire_direct(). */ return (cur->req_prio == expected_prio); @@ -531,6 +560,7 @@ static struct printk_buffers panic_nbcon_pbufs; * nbcon_context_try_acquire - Try to acquire nbcon console * @ctxt: The context of the caller * + * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. * * If the caller allowed an unsafe hostile takeover, on success the @@ -538,7 +568,6 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. */ -__maybe_unused static bool nbcon_context_try_acquire(struct nbcon_context *ctxt) { unsigned int cpu = smp_processor_id(); @@ -581,11 +610,29 @@ static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu, int expected_prio) { /* - * Since consoles can only be acquired by higher priorities, - * owning contexts are uniquely identified by @prio. However, - * since contexts can unexpectedly lose ownership, it is - * possible that later another owner appears with the same - * priority. For this reason @cpu is also needed. + * A similar function, nbcon_waiter_matches(), only deals with + * EMERGENCY and PANIC priorities. However, this function must also + * deal with the NORMAL priority, which requires additional checks + * and constraints. + * + * For the case where preemption and interrupts are disabled, it is + * enough to also verify that the owning CPU has not changed. + * + * For the case where preemption or interrupts are enabled, an + * external synchronization method *must* be used. In particular, + * the driver-specific locking mechanism used in device_lock() + * (including disabling migration) should be used. It prevents + * scenarios such as: + * + * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and + * is scheduled out. + * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY + * and releases it. + * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X] + * and is scheduled out. + * 4. [Task A] gets running on [CPU X] and sees that the console is + * still owned by a task on [CPU X] with NBON_PRIO_NORMAL. Thus + * [Task A] thinks it is the owner when it is not. */ if (cur->prio != expected_prio) @@ -784,6 +831,19 @@ out: return nbcon_context_can_proceed(ctxt, &cur); } +static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + struct nbcon_state cur; + + wctxt->outbuf = buf; + wctxt->len = len; + nbcon_state_read(con, &cur); + wctxt->unsafe_takeover = cur.unsafe_takeover; +} + /** * nbcon_enter_unsafe - Enter an unsafe region in the driver * @wctxt: The write context that was handed to the write function @@ -799,8 +859,12 @@ out: bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + bool is_owner; - return nbcon_context_enter_unsafe(ctxt); + is_owner = nbcon_context_enter_unsafe(ctxt); + if (!is_owner) + nbcon_write_context_set_buf(wctxt, NULL, 0); + return is_owner; } EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); @@ -819,14 +883,47 @@ EXPORT_SYMBOL_GPL(nbcon_enter_unsafe); bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + bool ret; - return nbcon_context_exit_unsafe(ctxt); + ret = nbcon_context_exit_unsafe(ctxt); + if (!ret) + nbcon_write_context_set_buf(wctxt, NULL, 0); + return ret; } EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); /** + * nbcon_reacquire_nobuf - Reacquire a console after losing ownership + * while printing + * @wctxt: The write context that was handed to the write callback + * + * Since ownership can be lost at any time due to handover or takeover, a + * printing context _must_ be prepared to back out immediately and + * carefully. However, there are scenarios where the printing context must + * reacquire ownership in order to finalize or revert hardware changes. + * + * This function allows a printing context to reacquire ownership using the + * same priority as its previous ownership. + * + * Note that after a successful reacquire the printing context will have no + * output buffer because that has been lost. This function cannot be used to + * resume printing. + */ +void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + + while (!nbcon_context_try_acquire(ctxt)) + cpu_relax(); + + nbcon_write_context_set_buf(wctxt, NULL, 0); +} +EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf); + +/** * nbcon_emit_next_record - Emit a record in the acquired context * @wctxt: The write context that will be handed to the write function + * @use_atomic: True if the write_atomic() callback is to be used * * Return: True if this context still owns the console. False if * ownership was handed over or taken. @@ -840,8 +937,7 @@ EXPORT_SYMBOL_GPL(nbcon_exit_unsafe); * When true is returned, @wctxt->ctxt.backlog indicates whether there are * still records pending in the ringbuffer, */ -__maybe_unused -static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) +static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic) { struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); struct console *con = ctxt->console; @@ -852,7 +948,22 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) unsigned long con_dropped; struct nbcon_state cur; unsigned long dropped; - bool done; + unsigned long ulseq; + + /* + * This function should never be called for consoles that have not + * implemented the necessary callback for writing: i.e. legacy + * consoles and, when atomic, nbcon consoles with no write_atomic(). + * Handle it as if ownership was lost and try to continue. + * + * Note that for nbcon consoles the write_thread() callback is + * mandatory and was already checked in nbcon_alloc(). + */ + if (WARN_ON_ONCE((use_atomic && !con->write_atomic) || + !(console_srcu_read_flags(con) & CON_NBCON))) { + nbcon_context_release(ctxt); + return false; + } /* * The printk buffers are filled within an unsafe section. This @@ -878,6 +989,29 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) if (dropped && !is_extended) console_prepend_dropped(&pmsg, dropped); + /* + * If the previous owner was assigned the same record, this context + * has taken over ownership and is replaying the record. Prepend a + * message to let the user know the record is replayed. + */ + ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq)); + if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) { + console_prepend_replay(&pmsg); + } else { + /* + * Ensure this context is still the owner before trying to + * update @nbcon_prev_seq. Otherwise the value in @ulseq may + * not be from the previous owner and instead be some later + * value from the context that took over ownership. + */ + nbcon_state_read(con, &cur); + if (!nbcon_context_can_proceed(ctxt, &cur)) + return false; + + atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq, + __u64seq_to_ulseq(pmsg.seq)); + } + if (!nbcon_context_exit_unsafe(ctxt)) return false; @@ -886,22 +1020,27 @@ static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt) goto update_con; /* Initialize the write context for driver callbacks. */ - wctxt->outbuf = &pmsg.pbufs->outbuf[0]; - wctxt->len = pmsg.outbuf_len; - nbcon_state_read(con, &cur); - wctxt->unsafe_takeover = cur.unsafe_takeover; + nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len); - if (con->write_atomic) { - done = con->write_atomic(con, wctxt); - } else { + if (use_atomic) + con->write_atomic(con, wctxt); + else + con->write_thread(con, wctxt); + + if (!wctxt->outbuf) { + /* + * Ownership was lost and reacquired by the driver. Handle it + * as if ownership was lost. + */ nbcon_context_release(ctxt); - WARN_ON_ONCE(1); - done = false; + return false; } - /* If not done, the emit was aborted. */ - if (!done) - return false; + /* + * Ownership may have been lost but _not_ reacquired by the driver. + * This case is detected and handled when entering unsafe to update + * dropped/seq values. + */ /* * Since any dropped message was successfully output, reset the @@ -928,54 +1067,650 @@ update_con: return nbcon_context_exit_unsafe(ctxt); } +/* + * nbcon_emit_one - Print one record for an nbcon console using the + * specified callback + * @wctxt: An initialized write context struct to use for this context + * @use_atomic: True if the write_atomic() callback is to be used + * + * Return: True, when a record has been printed and there are still + * pending records. The caller might want to continue flushing. + * + * False, when there is no pending record, or when the console + * context cannot be acquired, or the ownership has been lost. + * The caller should give up. Either the job is done, cannot be + * done, or will be handled by the owning context. + * + * This is an internal helper to handle the locking of the console before + * calling nbcon_emit_next_record(). + */ +static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt); + struct console *con = ctxt->console; + unsigned long flags; + bool ret = false; + + if (!use_atomic) { + con->device_lock(con, &flags); + + /* + * Ensure this stays on the CPU to make handover and + * takeover possible. + */ + cant_migrate(); + } + + if (!nbcon_context_try_acquire(ctxt)) + goto out; + + /* + * nbcon_emit_next_record() returns false when the console was + * handed over or taken over. In both cases the context is no + * longer valid. + * + * The higher priority printing context takes over responsibility + * to print the pending records. + */ + if (!nbcon_emit_next_record(wctxt, use_atomic)) + goto out; + + nbcon_context_release(ctxt); + + ret = ctxt->backlog; +out: + if (!use_atomic) + con->device_unlock(con, flags); + return ret; +} + /** - * nbcon_alloc - Allocate buffers needed by the nbcon console - * @con: Console to allocate buffers for + * nbcon_kthread_should_wakeup - Check whether a printer thread should wakeup + * @con: Console to operate on + * @ctxt: The nbcon context from nbcon_context_try_acquire() * - * Return: True on success. False otherwise and the console cannot - * be used. + * Return: True if the thread should shutdown or if the console is + * allowed to print and a record is available. False otherwise. * - * This is not part of nbcon_init() because buffer allocation must - * be performed earlier in the console registration process. + * After the thread wakes up, it must first check if it should shutdown before + * attempting any printing. */ -bool nbcon_alloc(struct console *con) +static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt) { - if (con->flags & CON_BOOT) { + bool ret = false; + short flags; + int cookie; + + if (kthread_should_stop()) + return true; + + cookie = console_srcu_read_lock(); + + flags = console_srcu_read_flags(con); + if (console_is_usable(con, flags, false)) { + /* Bring the sequence in @ctxt up to date */ + ctxt->seq = nbcon_seq_read(con); + + ret = prb_read_valid(prb, ctxt->seq, NULL); + } + + console_srcu_read_unlock(cookie); + return ret; +} + +/** + * nbcon_kthread_func - The printer thread function + * @__console: Console to operate on + * + * Return: 0 + */ +static int nbcon_kthread_func(void *__console) +{ + struct console *con = __console; + struct nbcon_write_context wctxt = { + .ctxt.console = con, + .ctxt.prio = NBCON_PRIO_NORMAL, + }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + short con_flags; + bool backlog; + int cookie; + +wait_for_event: + /* + * Guarantee this task is visible on the rcuwait before + * checking the wake condition. + * + * The full memory barrier within set_current_state() of + * ___rcuwait_wait_event() pairs with the full memory + * barrier within rcuwait_has_sleeper(). + * + * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A. + */ + rcuwait_wait_event(&con->rcuwait, + nbcon_kthread_should_wakeup(con, ctxt), + TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */ + + do { + if (kthread_should_stop()) + return 0; + + backlog = false; + /* - * Boot console printing is synchronized with legacy console - * printing, so boot consoles can share the same global printk - * buffers. + * Keep the srcu read lock around the entire operation so that + * synchronize_srcu() can guarantee that the kthread stopped + * or suspended printing. */ - con->pbufs = &printk_shared_pbufs; + cookie = console_srcu_read_lock(); + + con_flags = console_srcu_read_flags(con); + + if (console_is_usable(con, con_flags, false)) + backlog = nbcon_emit_one(&wctxt, false); + + console_srcu_read_unlock(cookie); + + cond_resched(); + + } while (backlog); + + goto wait_for_event; +} + +/** + * nbcon_irq_work - irq work to wake console printer thread + * @irq_work: The irq work to operate on + */ +static void nbcon_irq_work(struct irq_work *irq_work) +{ + struct console *con = container_of(irq_work, struct console, irq_work); + + nbcon_kthread_wake(con); +} + +static inline bool rcuwait_has_sleeper(struct rcuwait *w) +{ + /* + * Guarantee any new records can be seen by tasks preparing to wait + * before this context checks if the rcuwait is empty. + * + * This full memory barrier pairs with the full memory barrier within + * set_current_state() of ___rcuwait_wait_event(), which is called + * after prepare_to_rcuwait() adds the waiter but before it has + * checked the wait condition. + * + * This pairs with nbcon_kthread_func:A. + */ + smp_mb(); /* LMM(rcuwait_has_sleeper:A) */ + return rcuwait_active(w); +} + +/** + * nbcon_kthreads_wake - Wake up printing threads using irq_work + */ +void nbcon_kthreads_wake(void) +{ + struct console *con; + int cookie; + + if (!printk_kthreads_running) + return; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + if (!(console_srcu_read_flags(con) & CON_NBCON)) + continue; + + /* + * Only schedule irq_work if the printing thread is + * actively waiting. If not waiting, the thread will + * notice by itself that it has work to do. + */ + if (rcuwait_has_sleeper(&con->rcuwait)) + irq_work_queue(&con->irq_work); + } + console_srcu_read_unlock(cookie); +} + +/* + * nbcon_kthread_stop - Stop a console printer thread + * @con: Console to operate on + */ +void nbcon_kthread_stop(struct console *con) +{ + lockdep_assert_console_list_lock_held(); + + if (!con->kthread) + return; + + kthread_stop(con->kthread); + con->kthread = NULL; +} + +/** + * nbcon_kthread_create - Create a console printer thread + * @con: Console to operate on + * + * Return: True if the kthread was started or already exists. + * Otherwise false and @con must not be registered. + * + * This function is called when it will be expected that nbcon consoles are + * flushed using the kthread. The messages printed with NBCON_PRIO_NORMAL + * will be no longer flushed by the legacy loop. This is why failure must + * be fatal for console registration. + * + * If @con was already registered and this function fails, @con must be + * unregistered before the global state variable @printk_kthreads_running + * can be set. + */ +bool nbcon_kthread_create(struct console *con) +{ + struct task_struct *kt; + + lockdep_assert_console_list_lock_held(); + + if (con->kthread) + return true; + + kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index); + if (WARN_ON(IS_ERR(kt))) { + con_printk(KERN_ERR, con, "failed to start printing thread\n"); + return false; + } + + con->kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. + */ + sched_set_normal(con->kthread, -20); + + return true; +} + +/* Track the nbcon emergency nesting per CPU. */ +static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting); +static unsigned int early_nbcon_pcpu_emergency_nesting __initdata; + +/** + * nbcon_get_cpu_emergency_nesting - Get the per CPU emergency nesting pointer + * + * Context: For reading, any context. For writing, any context which could + * not be migrated to another CPU. + * Return: Either a pointer to the per CPU emergency nesting counter of + * the current CPU or to the init data during early boot. + * + * The function is safe for reading per-CPU variables in any context because + * preemption is disabled if the current CPU is in the emergency state. See + * also nbcon_cpu_emergency_enter(). + */ +static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void) +{ + /* + * The value of __printk_percpu_data_ready gets set in normal + * context and before SMP initialization. As a result it could + * never change while inside an nbcon emergency section. + */ + if (!printk_percpu_data_ready()) + return &early_nbcon_pcpu_emergency_nesting; + + return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting); +} + +/** + * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon + * printing on the current CPU + * + * Context: Any context. + * Return: The nbcon_prio to use for acquiring an nbcon console in this + * context for printing. + * + * The function is safe for reading per-CPU data in any context because + * preemption is disabled if the current CPU is in the emergency or panic + * state. + */ +enum nbcon_prio nbcon_get_default_prio(void) +{ + unsigned int *cpu_emergency_nesting; + + if (this_cpu_in_panic()) + return NBCON_PRIO_PANIC; + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + if (*cpu_emergency_nesting) + return NBCON_PRIO_EMERGENCY; + + return NBCON_PRIO_NORMAL; +} + +/** + * nbcon_legacy_emit_next_record - Print one record for an nbcon console + * in legacy contexts + * @con: The console to print on + * @handover: Will be set to true if a printk waiter has taken over the + * console_lock, in which case the caller is no longer holding + * both the console_lock and the SRCU read lock. Otherwise it + * is set to false. + * @cookie: The cookie from the SRCU read lock. + * @use_atomic: Set true when called in an atomic or unknown context. + * It affects which nbcon callback will be used: write_atomic() + * or write_thread(). + * + * When false, the write_thread() callback is used and would be + * called in a preemtible context unless disabled by the + * device_lock. The legacy handover is not allowed in this mode. + * + * Context: Any context except NMI. + * Return: True, when a record has been printed and there are still + * pending records. The caller might want to continue flushing. + * + * False, when there is no pending record, or when the console + * context cannot be acquired, or the ownership has been lost. + * The caller should give up. Either the job is done, cannot be + * done, or will be handled by the owning context. + * + * This function is meant to be called by console_flush_all() to print records + * on nbcon consoles from legacy context (printing via console unlocking). + * Essentially it is the nbcon version of console_emit_next_record(). + */ +bool nbcon_legacy_emit_next_record(struct console *con, bool *handover, + int cookie, bool use_atomic) +{ + struct nbcon_write_context wctxt = { }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + unsigned long flags; + bool progress; + + ctxt->console = con; + ctxt->prio = nbcon_get_default_prio(); + + if (use_atomic) { + /* + * In an atomic or unknown context, use the same procedure as + * in console_emit_next_record(). It allows to handover. + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); + stop_critical_timings(); + } + + progress = nbcon_emit_one(&wctxt, use_atomic); + + if (use_atomic) { + start_critical_timings(); + *handover = console_lock_spinning_disable_and_check(cookie); + printk_safe_exit_irqrestore(flags); } else { - con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); - if (!con->pbufs) { - con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); - return false; + /* Non-atomic does not perform legacy spinning handovers. */ + *handover = false; + } + + return progress; +} + +/** + * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its + * write_atomic() callback + * @con: The nbcon console to flush + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + * + * Return: 0 if @con was flushed up to @stop_seq Otherwise, error code on + * failure. + * + * Errors: + * + * -EPERM: Unable to acquire console ownership. + * + * -EAGAIN: Another context took over ownership while printing. + * + * -ENOENT: A record before @stop_seq is not available. + * + * If flushing up to @stop_seq was not successful, it only makes sense for the + * caller to try again when -EAGAIN was returned. When -EPERM is returned, + * this context is not allowed to acquire the console. When -ENOENT is + * returned, it cannot be expected that the unfinalized record will become + * available. + */ +static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, + bool allow_unsafe_takeover) +{ + struct nbcon_write_context wctxt = { }; + struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt); + int err = 0; + + ctxt->console = con; + ctxt->spinwait_max_us = 2000; + ctxt->prio = nbcon_get_default_prio(); + ctxt->allow_unsafe_takeover = allow_unsafe_takeover; + + if (!nbcon_context_try_acquire(ctxt)) + return -EPERM; + + while (nbcon_seq_read(con) < stop_seq) { + /* + * nbcon_emit_next_record() returns false when the console was + * handed over or taken over. In both cases the context is no + * longer valid. + */ + if (!nbcon_emit_next_record(&wctxt, true)) + return -EAGAIN; + + if (!ctxt->backlog) { + /* Are there reserved but not yet finalized records? */ + if (nbcon_seq_read(con) < stop_seq) + err = -ENOENT; + break; } } - return true; + nbcon_context_release(ctxt); + return err; +} + +/** + * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its + * write_atomic() callback + * @con: The nbcon console to flush + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + * + * This will stop flushing before @stop_seq if another context has ownership. + * That context is then responsible for the flushing. Likewise, if new records + * are added while this context was flushing and there is no other context + * to handle the printing, this context must also flush those records. + */ +static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq, + bool allow_unsafe_takeover) +{ + struct console_flush_type ft; + unsigned long flags; + int err; + +again: + /* + * Atomic flushing does not use console driver synchronization (i.e. + * it does not hold the port lock for uart consoles). Therefore IRQs + * must be disabled to avoid being interrupted and then calling into + * a driver that will deadlock trying to acquire console ownership. + */ + local_irq_save(flags); + + err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + + local_irq_restore(flags); + + /* + * If there was a new owner (-EPERM, -EAGAIN), that context is + * responsible for completing. + * + * Do not wait for records not yet finalized (-ENOENT) to avoid a + * possible deadlock. They will either get flushed by the writer or + * eventually skipped on panic CPU. + */ + if (err) + return; + + /* + * If flushing was successful but more records are available, this + * context must flush those remaining records if the printer thread + * is not available do it. + */ + printk_get_console_flush_type(&ft); + if (!ft.nbcon_offload && + prb_read_valid(prb, nbcon_seq_read(con), NULL)) { + stop_seq = prb_next_reserve_seq(prb); + goto again; + } +} + +/** + * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their + * write_atomic() callback + * @stop_seq: Flush up until this record + * @allow_unsafe_takeover: True, to allow unsafe hostile takeovers + */ +static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover) +{ + struct console *con; + int cookie; + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + + if (!(flags & CON_NBCON)) + continue; + + if (!console_is_usable(con, flags, true)) + continue; + + if (nbcon_seq_read(con) >= stop_seq) + continue; + + nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover); + } + console_srcu_read_unlock(cookie); +} + +/** + * nbcon_atomic_flush_pending - Flush all nbcon consoles using their + * write_atomic() callback + * + * Flush the backlog up through the currently newest record. Any new + * records added while flushing will not be flushed if there is another + * context available to handle the flushing. This is to avoid one CPU + * printing unbounded because other CPUs continue to add records. + */ +void nbcon_atomic_flush_pending(void) +{ + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false); +} + +/** + * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their + * write_atomic() callback and allowing unsafe hostile takeovers + * + * Flush the backlog up through the currently newest record. Unsafe hostile + * takeovers will be performed, if necessary. + */ +void nbcon_atomic_flush_unsafe(void) +{ + __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true); +} + +/** + * nbcon_cpu_emergency_enter - Enter an emergency section where printk() + * messages for that CPU are flushed directly + * + * Context: Any context. Disables preemption. + * + * When within an emergency section, printk() calls will attempt to flush any + * pending messages in the ringbuffer. + */ +void nbcon_cpu_emergency_enter(void) +{ + unsigned int *cpu_emergency_nesting; + + preempt_disable(); + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + (*cpu_emergency_nesting)++; +} + +/** + * nbcon_cpu_emergency_exit - Exit an emergency section + * + * Context: Within an emergency section. Enables preemption. + */ +void nbcon_cpu_emergency_exit(void) +{ + unsigned int *cpu_emergency_nesting; + + cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); + + if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) + (*cpu_emergency_nesting)--; + + preempt_enable(); } /** - * nbcon_init - Initialize the nbcon console specific data + * nbcon_alloc - Allocate and init the nbcon console specific data * @con: Console to initialize * - * nbcon_alloc() *must* be called and succeed before this function - * is called. + * Return: True if the console was fully allocated and initialized. + * Otherwise @con must not be registered. * - * This function expects that the legacy @con->seq has been set. + * When allocation and init was successful, the console must be properly + * freed using nbcon_free() once it is no longer needed. */ -void nbcon_init(struct console *con) +bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; - /* nbcon_alloc() must have been called and successful! */ - BUG_ON(!con->pbufs); + /* The write_thread() callback is mandatory. */ + if (WARN_ON(!con->write_thread)) + return false; - nbcon_seq_force(con, con->seq); + rcuwait_init(&con->rcuwait); + init_irq_work(&con->irq_work, nbcon_irq_work); + atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL); nbcon_state_set(con, &state); + + /* + * Initialize @nbcon_seq to the highest possible sequence number so + * that practically speaking it will have nothing to print until a + * desired initial sequence number has been set via nbcon_seq_force(). + */ + atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb)); + + if (con->flags & CON_BOOT) { + /* + * Boot console printing is synchronized with legacy console + * printing, so boot consoles can share the same global printk + * buffers. + */ + con->pbufs = &printk_shared_pbufs; + } else { + con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL); + if (!con->pbufs) { + con_printk(KERN_ERR, con, "failed to allocate printing buffer\n"); + return false; + } + + if (printk_kthreads_running) { + if (!nbcon_kthread_create(con)) { + kfree(con->pbufs); + con->pbufs = NULL; + return false; + } + } + } + + return true; } /** @@ -986,6 +1721,9 @@ void nbcon_free(struct console *con) { struct nbcon_state state = { }; + if (printk_kthreads_running) + nbcon_kthread_stop(con); + nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ @@ -994,3 +1732,85 @@ void nbcon_free(struct console *con) con->pbufs = NULL; } + +/** + * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe + * section + * @con: The nbcon console to acquire + * + * Context: Under the locking mechanism implemented in + * @con->device_lock() including disabling migration. + * Return: True if the console was acquired. False otherwise. + * + * Console drivers will usually use their own internal synchronization + * mechasism to synchronize between console printing and non-printing + * activities (such as setting baud rates). However, nbcon console drivers + * supporting atomic consoles may also want to mark unsafe sections when + * performing non-printing activities in order to synchronize against their + * atomic_write() callback. + * + * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL + * and marks it unsafe for handover/takeover. + */ +bool nbcon_device_try_acquire(struct console *con) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); + + cant_migrate(); + + memset(ctxt, 0, sizeof(*ctxt)); + ctxt->console = con; + ctxt->prio = NBCON_PRIO_NORMAL; + + if (!nbcon_context_try_acquire(ctxt)) + return false; + + if (!nbcon_context_enter_unsafe(ctxt)) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(nbcon_device_try_acquire); + +/** + * nbcon_device_release - Exit unsafe section and release the nbcon console + * @con: The nbcon console acquired in nbcon_device_try_acquire() + */ +void nbcon_device_release(struct console *con) +{ + struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt); + struct console_flush_type ft; + int cookie; + + if (!nbcon_context_exit_unsafe(ctxt)) + return; + + nbcon_context_release(ctxt); + + /* + * This context must flush any new records added while the console + * was locked if the printer thread is not available to do it. The + * console_srcu_read_lock must be taken to ensure the console is + * usable throughout flushing. + */ + cookie = console_srcu_read_lock(); + printk_get_console_flush_type(&ft); + if (console_is_usable(con, console_srcu_read_flags(con), true) && + !ft.nbcon_offload && + prb_read_valid(prb, nbcon_seq_read(con), NULL)) { + /* + * If nbcon_atomic flushing is not available, fallback to + * using the legacy loop. + */ + if (ft.nbcon_atomic) { + __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false); + } else if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } else if (ft.legacy_offload) { + printk_trigger_flush(); + } + } + console_srcu_read_unlock(cookie); +} +EXPORT_SYMBOL_GPL(nbcon_device_release); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c22b07049c38..beb808f4c367 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -34,6 +34,7 @@ #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> +#include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> @@ -282,6 +283,7 @@ EXPORT_SYMBOL(console_list_unlock); * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) + __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } @@ -295,6 +297,7 @@ EXPORT_SYMBOL(console_srcu_read_lock); * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) + __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } @@ -461,14 +464,43 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); +/* + * Specifies if a legacy console is registered. If legacy consoles are + * present, it is necessary to perform the console lock/unlock dance + * whenever console flushing should occur. + */ +bool have_legacy_console; + +/* + * Specifies if an nbcon console is registered. If nbcon consoles are present, + * synchronous printing of legacy consoles will not occur during panic until + * the backtrace has been stored to the ringbuffer. + */ +bool have_nbcon_console; + +/* + * Specifies if a boot console is registered. If boot consoles are present, + * nbcon consoles cannot print simultaneously and must be synchronized by + * the console lock. This is because boot consoles and nbcon consoles may + * have mapped the same hardware. + */ +bool have_boot_console; + +/* See printk_legacy_allow_panic_sync() for details. */ +bool legacy_allow_panic_sync; + #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); +static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; +/* True when _all_ printer threads are available for printing. */ +bool printk_kthreads_running; + struct latched_seq { seqcount_latch_t latch; u64 val[2]; @@ -1850,7 +1882,7 @@ static bool console_waiter; * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ -static void console_lock_spinning_enable(void) +void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. @@ -1889,7 +1921,7 @@ lockdep: * * Return: 1 if the lock rights were passed, 0 otherwise. */ -static int console_lock_spinning_disable_and_check(int cookie) +int console_lock_spinning_disable_and_check(int cookie) { int waiter; @@ -2300,12 +2332,30 @@ out: return ret; } +/* + * This acts as a one-way switch to allow legacy consoles to print from + * the printk() caller context on a panic CPU. It also attempts to flush + * the legacy consoles in this context. + */ +void printk_legacy_allow_panic_sync(void) +{ + struct console_flush_type ft; + + legacy_allow_panic_sync = true; + + printk_get_console_flush_type(&ft); + if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } +} + asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { + struct console_flush_type ft; int printed_len; - bool in_sched = false; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) @@ -2319,17 +2369,26 @@ asmlinkage int vprintk_emit(int facility, int level, if (other_cpu_in_panic() && !panic_triggering_all_cpu_backtrace) return 0; + printk_get_console_flush_type(&ft); + + /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; - in_sched = true; + ft.legacy_offload |= ft.legacy_direct; + ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + + if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during @@ -2349,7 +2408,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } - if (in_sched) + if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); @@ -2620,6 +2679,7 @@ int match_devname_and_update_preferred_console(const char *devname, return -ENOENT; } +EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); @@ -2677,6 +2737,7 @@ void suspend_console(void) void resume_console(void) { + struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) @@ -2694,6 +2755,12 @@ void resume_console(void) */ synchronize_srcu(&console_srcu); + printk_get_console_flush_type(&ft); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + if (ft.legacy_offload) + defer_console_output(); + pr_flush(1000, true); } @@ -2708,10 +2775,16 @@ void resume_console(void) */ static int console_cpu_notify(unsigned int cpu) { + struct console_flush_type ft; + if (!cpuhp_tasks_frozen) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.legacy_direct) { + if (console_trylock()) + console_unlock(); + } } return 0; } @@ -2765,36 +2838,6 @@ int is_console_locked(void) } EXPORT_SYMBOL(is_console_locked); -/* - * Check if the given console is currently capable and allowed to print - * records. - * - * Requires the console_srcu_read_lock. - */ -static inline bool console_is_usable(struct console *con) -{ - short flags = console_srcu_read_flags(con); - - if (!(flags & CON_ENABLED)) - return false; - - if ((flags & CON_SUSPENDED)) - return false; - - if (!con->write) - return false; - - /* - * Console drivers may assume that per-cpu resources have been - * allocated. So unless they're explicitly marked as being able to - * cope (CON_ANYTIME) don't call them until this CPU is officially up. - */ - if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) - return false; - - return true; -} - static void __console_unlock(void) { console_locked = 0; @@ -2804,30 +2847,31 @@ static void __console_unlock(void) #ifdef CONFIG_PRINTK /* - * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This - * is achieved by shifting the existing message over and inserting the dropped - * message. + * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting + * the existing message over and inserting the scratchbuf message. * - * @pmsg is the printk message to prepend. - * - * @dropped is the dropped count to report in the dropped message. + * @pmsg is the original printk message. + * @fmt is the printf format of the message which will prepend the existing one. * - * If the message text in @pmsg->pbufs->outbuf does not have enough space for - * the dropped message, the message text will be sufficiently truncated. + * If there is not enough space in @pmsg->pbufs->outbuf, the existing + * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ -void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +__printf(2, 3) +static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; + va_list args; size_t len; - len = scnprintf(scratchbuf, scratchbuf_sz, - "** %lu printk messages dropped **\n", dropped); + va_start(args, fmt); + len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); + va_end(args); /* * Make sure outbuf is sufficiently large before prepending. @@ -2850,6 +2894,30 @@ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) } /* + * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". + * @pmsg->outbuf_len is updated appropriately. + * + * @pmsg is the printk message to prepend. + * + * @dropped is the dropped count to report in the dropped message. + */ +void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +{ + console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); +} + +/* + * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". + * @pmsg->outbuf_len is updated appropriately. + * + * @pmsg is the printk message to prepend. + */ +void console_prepend_replay(struct printk_message *pmsg) +{ + console_prepend_message(pmsg, "** replaying previous printk message **\n"); +} + +/* * Read and format the specified record (or a later record if the specified * record is not available). * @@ -2915,6 +2983,34 @@ out: } /* + * Legacy console printing from printk() caller context does not respect + * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a + * false positive. For PREEMPT_RT the false positive condition does not + * occur. + * + * This map is used to temporarily establish LD_WAIT_SLEEP context for the + * console write() callback when legacy printing to avoid false positive + * lockdep complaints, thus allowing lockdep to continue to function for + * real issues. + */ +#ifdef CONFIG_PREEMPT_RT +static inline void printk_legacy_allow_spinlock_enter(void) { } +static inline void printk_legacy_allow_spinlock_exit(void) { } +#else +static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); + +static inline void printk_legacy_allow_spinlock_enter(void) +{ + lock_map_acquire_try(&printk_legacy_map); +} + +static inline void printk_legacy_allow_spinlock_exit(void) +{ + lock_map_release(&printk_legacy_map); +} +#endif /* CONFIG_PREEMPT_RT */ + +/* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. @@ -2963,31 +3059,46 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co con->dropped = 0; } - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - * - * Interrupts are disabled because the hand over to a waiter - * must not be interrupted until the hand over is completed - * (@console_waiter is cleared). - */ - printk_safe_enter_irqsave(flags); - console_lock_spinning_enable(); + /* Write everything out to the hardware. */ - /* Do not trace print latency. */ - stop_critical_timings(); + if (force_legacy_kthread() && !panic_in_progress()) { + /* + * With forced threading this function is in a task context + * (either legacy kthread or get_init_console_seq()). There + * is no need for concern about printk reentrance, handovers, + * or lockdep complaints. + */ - /* Write everything out to the hardware. */ - con->write(con, outbuf, pmsg.outbuf_len); + con->write(con, outbuf, pmsg.outbuf_len); + con->seq = pmsg.seq + 1; + } else { + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + * + * Interrupts are disabled because the hand over to a waiter + * must not be interrupted until the hand over is completed + * (@console_waiter is cleared). + */ + printk_safe_enter_irqsave(flags); + console_lock_spinning_enable(); - start_critical_timings(); + /* Do not trace print latency. */ + stop_critical_timings(); - con->seq = pmsg.seq + 1; + printk_legacy_allow_spinlock_enter(); + con->write(con, outbuf, pmsg.outbuf_len); + printk_legacy_allow_spinlock_exit(); - *handover = console_lock_spinning_disable_and_check(cookie); - printk_safe_exit_irqrestore(flags); + start_critical_timings(); + + con->seq = pmsg.seq + 1; + + *handover = console_lock_spinning_disable_and_check(cookie); + printk_safe_exit_irqrestore(flags); + } skip: return true; } @@ -3000,6 +3111,8 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co return false; } +static inline void printk_kthreads_check_locked(void) { } + #endif /* CONFIG_PRINTK */ /* @@ -3027,6 +3140,7 @@ static bool console_emit_next_record(struct console *con, bool *handover, int co */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { + struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; @@ -3038,15 +3152,34 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove do { any_progress = false; + printk_get_console_flush_type(&ft); + cookie = console_srcu_read_lock(); for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; bool progress; - if (!console_is_usable(con)) + /* + * console_flush_all() is only responsible for nbcon + * consoles when the nbcon consoles cannot print via + * their atomic or threaded flushing. + */ + if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) + continue; + + if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; - progress = console_emit_next_record(con, handover, cookie); + if (flags & CON_NBCON) { + progress = nbcon_legacy_emit_next_record(con, handover, cookie, + !do_cond_resched); + printk_seq = nbcon_seq_read(con); + } else { + progress = console_emit_next_record(con, handover, cookie); + printk_seq = con->seq; + } /* * If a handover has occurred, the SRCU read lock @@ -3056,8 +3189,8 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove return false; /* Track the next of the highest seq flushed. */ - if (con->seq > *next_seq) - *next_seq = con->seq; + if (printk_seq > *next_seq) + *next_seq = printk_seq; if (!progress) continue; @@ -3080,19 +3213,7 @@ abandon: return false; } -/** - * console_unlock - unblock the console subsystem from printing - * - * Releases the console_lock which the caller holds to block printing of - * the console subsystem. - * - * While the console_lock was held, console output may have been buffered - * by printk(). If this is the case, console_unlock(); emits - * the output prior to releasing the lock. - * - * console_unlock(); may be called from any context. - */ -void console_unlock(void) +static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; @@ -3136,6 +3257,29 @@ void console_unlock(void) */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } + +/** + * console_unlock - unblock the legacy console subsystem from printing + * + * Releases the console_lock which the caller holds to block printing of + * the legacy console subsystem. + * + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock() emits the output on + * legacy consoles prior to releasing the lock. + * + * console_unlock(); may be called from any context. + */ +void console_unlock(void) +{ + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); + if (ft.legacy_direct) + __console_flush_and_unlock(); + else + __console_unlock(); +} EXPORT_SYMBOL(console_unlock); /** @@ -3258,6 +3402,7 @@ static void __console_rewind_all(void) */ void console_flush_on_panic(enum con_flush_mode mode) { + struct console_flush_type ft; bool handover; u64 next_seq; @@ -3281,7 +3426,13 @@ void console_flush_on_panic(enum con_flush_mode mode) if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); - console_flush_all(false, &next_seq, &handover); + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + + /* Flush legacy consoles once allowed, even when dangerous. */ + if (legacy_allow_panic_sync) + console_flush_all(false, &next_seq, &handover); } /* @@ -3338,13 +3489,236 @@ EXPORT_SYMBOL(console_stop); void console_start(struct console *console) { + struct console_flush_type ft; + bool is_nbcon; + console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); + is_nbcon = console->flags & CON_NBCON; console_list_unlock(); + + /* + * Ensure that all SRCU list walks have completed. The related + * printing context must be able to see it is enabled so that + * it is guaranteed to wake up and resume printing. + */ + synchronize_srcu(&console_srcu); + + printk_get_console_flush_type(&ft); + if (is_nbcon && ft.nbcon_offload) + nbcon_kthread_wake(console); + else if (ft.legacy_offload) + defer_console_output(); + __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_start); +#ifdef CONFIG_PRINTK +static int unregister_console_locked(struct console *console); + +/* True when system boot is far enough to create printer threads. */ +static bool printk_kthreads_ready __ro_after_init; + +static struct task_struct *printk_legacy_kthread; + +static bool legacy_kthread_should_wakeup(void) +{ + struct console_flush_type ft; + struct console *con; + bool ret = false; + int cookie; + + if (kthread_should_stop()) + return true; + + printk_get_console_flush_type(&ft); + + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { + short flags = console_srcu_read_flags(con); + u64 printk_seq; + + /* + * The legacy printer thread is only responsible for nbcon + * consoles when the nbcon consoles cannot print via their + * atomic or threaded flushing. + */ + if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) + continue; + + if (!console_is_usable(con, flags, false)) + continue; + + if (flags & CON_NBCON) { + printk_seq = nbcon_seq_read(con); + } else { + /* + * It is safe to read @seq because only this + * thread context updates @seq. + */ + printk_seq = con->seq; + } + + if (prb_read_valid(prb, printk_seq, NULL)) { + ret = true; + break; + } + } + console_srcu_read_unlock(cookie); + + return ret; +} + +static int legacy_kthread_func(void *unused) +{ + for (;;) { + wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); + + if (kthread_should_stop()) + break; + + console_lock(); + __console_flush_and_unlock(); + } + + return 0; +} + +static bool legacy_kthread_create(void) +{ + struct task_struct *kt; + + lockdep_assert_console_list_lock_held(); + + kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); + if (WARN_ON(IS_ERR(kt))) { + pr_err("failed to start legacy printing thread\n"); + return false; + } + + printk_legacy_kthread = kt; + + /* + * It is important that console printing threads are scheduled + * shortly after a printk call and with generous runtime budgets. + */ + sched_set_normal(printk_legacy_kthread, -20); + + return true; +} + +/** + * printk_kthreads_shutdown - shutdown all threaded printers + * + * On system shutdown all threaded printers are stopped. This allows printk + * to transition back to atomic printing, thus providing a robust mechanism + * for the final shutdown/reboot messages to be output. + */ +static void printk_kthreads_shutdown(void) +{ + struct console *con; + + console_list_lock(); + if (printk_kthreads_running) { + printk_kthreads_running = false; + + for_each_console(con) { + if (con->flags & CON_NBCON) + nbcon_kthread_stop(con); + } + + /* + * The threads may have been stopped while printing a + * backlog. Flush any records left over. + */ + nbcon_atomic_flush_pending(); + } + console_list_unlock(); +} + +static struct syscore_ops printk_syscore_ops = { + .shutdown = printk_kthreads_shutdown, +}; + +/* + * If appropriate, start nbcon kthreads and set @printk_kthreads_running. + * If any kthreads fail to start, those consoles are unregistered. + * + * Must be called under console_list_lock(). + */ +static void printk_kthreads_check_locked(void) +{ + struct hlist_node *tmp; + struct console *con; + + lockdep_assert_console_list_lock_held(); + + if (!printk_kthreads_ready) + return; + + if (have_legacy_console || have_boot_console) { + if (!printk_legacy_kthread && + force_legacy_kthread() && + !legacy_kthread_create()) { + /* + * All legacy consoles must be unregistered. If there + * are any nbcon consoles, they will set up their own + * kthread. + */ + hlist_for_each_entry_safe(con, tmp, &console_list, node) { + if (con->flags & CON_NBCON) + continue; + + unregister_console_locked(con); + } + } + } else if (printk_legacy_kthread) { + kthread_stop(printk_legacy_kthread); + printk_legacy_kthread = NULL; + } + + /* + * Printer threads cannot be started as long as any boot console is + * registered because there is no way to synchronize the hardware + * registers between boot console code and regular console code. + * It can only be known that there will be no new boot consoles when + * an nbcon console is registered. + */ + if (have_boot_console || !have_nbcon_console) { + /* Clear flag in case all nbcon consoles unregistered. */ + printk_kthreads_running = false; + return; + } + + if (printk_kthreads_running) + return; + + hlist_for_each_entry_safe(con, tmp, &console_list, node) { + if (!(con->flags & CON_NBCON)) + continue; + + if (!nbcon_kthread_create(con)) + unregister_console_locked(con); + } + + printk_kthreads_running = true; +} + +static int __init printk_set_kthreads_ready(void) +{ + register_syscore_ops(&printk_syscore_ops); + + console_list_lock(); + printk_kthreads_ready = true; + printk_kthreads_check_locked(); + console_list_unlock(); + + return 0; +} +early_initcall(printk_set_kthreads_ready); +#endif /* CONFIG_PRINTK */ + static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) @@ -3446,19 +3820,21 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_CONSDEV; } -static void console_init_seq(struct console *newcon, bool bootcon_registered) +/* Return the starting sequence number for a newly registered console. */ +static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; + u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); - newcon->seq = syslog_seq; + init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ - newcon->seq = prb_next_seq(prb); + init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered @@ -3479,7 +3855,7 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) * Flush all consoles and set the console to start at * the next unprinted sequence number. */ - if (!console_flush_all(true, &newcon->seq, &handover)) { + if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. @@ -3492,19 +3868,30 @@ static void console_init_seq(struct console *newcon, bool bootcon_registered) if (handover) console_lock(); - newcon->seq = prb_next_seq(prb); + init_seq = prb_next_seq(prb); for_each_console(con) { - if ((con->flags & CON_BOOT) && - (con->flags & CON_ENABLED) && - con->seq < newcon->seq) { - newcon->seq = con->seq; + u64 seq; + + if (!(con->flags & CON_BOOT) || + !(con->flags & CON_ENABLED)) { + continue; } + + if (con->flags & CON_NBCON) + seq = nbcon_seq_read(con); + else + seq = con->seq; + + if (seq < init_seq) + init_seq = seq; } } console_unlock(); } } + + return init_seq; } #define console_first() \ @@ -3533,9 +3920,12 @@ static int unregister_console_locked(struct console *console); */ void register_console(struct console *newcon) { - struct console *con; + bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; + struct console *con; + unsigned long flags; + u64 init_seq; int err; console_list_lock(); @@ -3613,10 +4003,31 @@ void register_console(struct console *newcon) } newcon->dropped = 0; - console_init_seq(newcon, bootcon_registered); + init_seq = get_init_console_seq(newcon, bootcon_registered); + + if (newcon->flags & CON_NBCON) { + have_nbcon_console = true; + nbcon_seq_force(newcon, init_seq); + } else { + have_legacy_console = true; + newcon->seq = init_seq; + } + + if (newcon->flags & CON_BOOT) + have_boot_console = true; - if (newcon->flags & CON_NBCON) - nbcon_init(newcon); + /* + * If another context is actively using the hardware of this new + * console, it will not be aware of the nbcon synchronization. This + * is a risk that two contexts could access the hardware + * simultaneously if this new console is used for atomic printing + * and the other context is still using the hardware. + * + * Use the driver synchronization to ensure that the hardware is not + * in use while this new console transitions to being registered. + */ + if (use_device_lock) + newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the @@ -3642,6 +4053,10 @@ void register_console(struct console *newcon) * register_console() completes. */ + /* This new console is now registered. */ + if (use_device_lock) + newcon->device_unlock(newcon, flags); + console_sysfs_notify(); /* @@ -3662,6 +4077,9 @@ void register_console(struct console *newcon) unregister_console_locked(con); } } + + /* Changed console list, may require printer threads to start/stop. */ + printk_kthreads_check_locked(); unlock: console_list_unlock(); } @@ -3670,6 +4088,12 @@ EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { + bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; + bool found_legacy_con = false; + bool found_nbcon_con = false; + bool found_boot_con = false; + unsigned long flags; + struct console *c; int res; lockdep_assert_console_list_lock_held(); @@ -3682,14 +4106,29 @@ static int unregister_console_locked(struct console *console) if (res > 0) return 0; + if (!console_is_registered_locked(console)) + res = -ENODEV; + else if (console_is_usable(console, console->flags, true)) + __pr_flush(console, 1000, true); + /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); - if (!console_is_registered_locked(console)) - return -ENODEV; + if (res < 0) + return res; + + /* + * Use the driver synchronization to ensure that the hardware is not + * in use while this console transitions to being unregistered. + */ + if (use_device_lock) + console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); + if (use_device_lock) + console->device_unlock(console, flags); + /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we @@ -3717,6 +4156,29 @@ static int unregister_console_locked(struct console *console) if (console->exit) res = console->exit(console); + /* + * With this console gone, the global flags tracking registered + * console types may have changed. Update them. + */ + for_each_console(c) { + if (c->flags & CON_BOOT) + found_boot_con = true; + + if (c->flags & CON_NBCON) + found_nbcon_con = true; + else + found_legacy_con = true; + } + if (!found_boot_con) + have_boot_console = found_boot_con; + if (!found_legacy_con) + have_legacy_console = found_legacy_con; + if (!found_nbcon_con) + have_nbcon_console = found_nbcon_con; + + /* Changed console list, may require printer threads to start/stop. */ + printk_kthreads_check_locked(); + return res; } @@ -3863,6 +4325,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; + struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; @@ -3871,13 +4334,22 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre u64 diff; u64 seq; + /* Sorry, pr_flush() will not work this early. */ + if (system_state < SYSTEM_SCHEDULING) + return false; + might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ - console_lock(); - console_unlock(); + printk_get_console_flush_type(&ft); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.legacy_direct) { + console_lock(); + console_unlock(); + } for (;;) { unsigned long begin_jiffies; @@ -3890,6 +4362,12 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. + * + * Holding the console_lock is not necessary if there + * are no legacy or boot consoles. However, such a + * console could register at any time. Always hold the + * console_lock as a precaution rather than + * synchronizing against register_console(). */ console_lock(); @@ -3905,8 +4383,10 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * that they make forward progress, so only increment * @diff for usable consoles. */ - if (!console_is_usable(c)) + if (!console_is_usable(c, flags, true) && + !console_is_usable(c, flags, false)) { continue; + } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); @@ -3974,9 +4454,13 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); + if (force_legacy_kthread()) { + if (printk_legacy_kthread) + wake_up_interruptible(&legacy_wait); + } else { + if (console_trylock()) + console_unlock(); + } } if (pending & PRINTK_PENDING_WAKEUP) @@ -4184,16 +4668,21 @@ const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** - * kmsg_dump - dump kernel log to kernel message dumpers. + * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping + * @desc: a short string to describe what caused the panic or oops. Can be NULL + * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ -void kmsg_dump(enum kmsg_dump_reason reason) +void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; + struct kmsg_dump_detail detail = { + .reason = reason, + .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { @@ -4211,7 +4700,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) continue; /* invoke dumper which will iterate over records */ - dumper->dump(dumper, reason); + dumper->dump(dumper, &detail); } rcu_read_unlock(); } @@ -4382,8 +4871,17 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); */ void console_try_replay_all(void) { + struct console_flush_type ft; + + printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); + if (ft.nbcon_atomic) + nbcon_atomic_flush_pending(); + if (ft.nbcon_offload) + nbcon_kthreads_wake(); + if (ft.legacy_offload) + defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h index 52626d0f1fa3..4ef81349d9fb 100644 --- a/kernel/printk/printk_ringbuffer.h +++ b/kernel/printk/printk_ringbuffer.h @@ -4,7 +4,10 @@ #define _KERNEL_PRINTK_RINGBUFFER_H #include <linux/atomic.h> +#include <linux/bits.h> #include <linux/dev_printk.h> +#include <linux/stddef.h> +#include <linux/types.h> /* * Meta information about each stored message. @@ -120,7 +123,7 @@ enum desc_state { #define _DATA_SIZE(sz_bits) (1UL << (sz_bits)) #define _DESCS_COUNT(ct_bits) (1U << (ct_bits)) -#define DESC_SV_BITS (sizeof(unsigned long) * 8) +#define DESC_SV_BITS BITS_PER_LONG #define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2) #define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT) #define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT)) @@ -401,10 +404,12 @@ u64 prb_next_reserve_seq(struct printk_ringbuffer *rb); #define __u64seq_to_ulseq(u64seq) (u64seq) #define __ulseq_to_u64seq(rb, ulseq) (ulseq) +#define ULSEQ_MAX(rb) (-1) #else /* CONFIG_64BIT */ #define __u64seq_to_ulseq(u64seq) ((u32)u64seq) +#define ULSEQ_MAX(rb) __u64seq_to_ulseq(prb_first_seq(rb) + 0x80000000UL) static inline u64 __ulseq_to_u64seq(struct printk_ringbuffer *rb, u32 ulseq) { diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 6d10927a07d8..2b35a9d3919d 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -26,6 +26,29 @@ void __printk_safe_exit(void) this_cpu_dec(printk_context); } +void __printk_deferred_enter(void) +{ + cant_migrate(); + __printk_safe_enter(); +} + +void __printk_deferred_exit(void) +{ + cant_migrate(); + __printk_safe_exit(); +} + +bool is_printk_legacy_deferred(void) +{ + /* + * The per-CPU variable @printk_context can be read safely in any + * context. CPU migration is always disabled when set. + */ + return (force_legacy_kthread() || + this_cpu_read(printk_context) || + in_nmi()); +} + asmlinkage int vprintk(const char *fmt, va_list args) { #ifdef CONFIG_KGDB_KDB @@ -38,7 +61,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Use the main logbuf even in NMI. But avoid calling console * drivers that might have their own locks. */ - if (this_cpu_read(printk_context) || in_nmi()) + if (is_printk_legacy_deferred()) return vprintk_deferred(fmt, args); /* No obstacles. */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 38238e595a61..feb3ac1dc5d5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -54,9 +54,6 @@ * grace-period sequence number. */ -#define RCU_SEQ_CTR_SHIFT 2 -#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) - /* Low-order bit definition for polled grace-period APIs. */ #define RCU_GET_STATE_COMPLETED 0x1 @@ -255,6 +252,11 @@ static inline void debug_rcu_head_callback(struct rcu_head *rhp) kmem_dump_obj(rhp); } +static inline bool rcu_barrier_cb_is_done(struct rcu_head *rhp) +{ + return rhp->next == rhp; +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) @@ -606,7 +608,7 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, #endif #ifdef CONFIG_TINY_RCU -static inline bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) { return false; } +static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; } static inline unsigned long rcu_get_gp_seq(void) { return 0; } static inline unsigned long rcu_exp_batches_completed(void) { return 0; } static inline unsigned long @@ -619,7 +621,7 @@ static inline void rcu_fwd_progress_check(unsigned long j) { } static inline void rcu_gp_slow_register(atomic_t *rgssp) { } static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { } #else /* #ifdef CONFIG_TINY_RCU */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp); +bool rcu_watching_zero_in_eqs(int cpu, int *vp); unsigned long rcu_get_gp_seq(void); unsigned long rcu_exp_batches_completed(void); unsigned long srcu_batches_completed(struct srcu_struct *sp); diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 1693ea22ef1b..298a2c573f02 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,17 +261,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded (or not) - */ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) -{ - if (offload) - rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); - else - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); -} - -/* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? */ diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 4fe877f5f654..259904075636 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,16 +89,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) - return true; - - return false; -} - -static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) -{ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED)) return true; return false; diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b53a9e8f5904..6d37596deb1f 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -39,6 +39,7 @@ #include <linux/torture.h> #include <linux/vmalloc.h> #include <linux/rcupdate_trace.h> +#include <linux/sched/debug.h> #include "rcu.h" @@ -104,6 +105,20 @@ static char *scale_type = "rcu"; module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); +// Structure definitions for custom fixed-per-task allocator. +struct writer_mblock { + struct rcu_head wmb_rh; + struct llist_node wmb_node; + struct writer_freelist *wmb_wfl; +}; + +struct writer_freelist { + struct llist_head ws_lhg; + atomic_t ws_inflight; + struct llist_head ____cacheline_internodealigned_in_smp ws_lhp; + struct writer_mblock *ws_mblocks; +}; + static int nrealreaders; static int nrealwriters; static struct task_struct **writer_tasks; @@ -111,6 +126,8 @@ static struct task_struct **reader_tasks; static struct task_struct *shutdown_task; static u64 **writer_durations; +static bool *writer_done; +static struct writer_freelist *writer_freelists; static int *writer_n_durations; static atomic_t n_rcu_scale_reader_started; static atomic_t n_rcu_scale_writer_started; @@ -120,7 +137,6 @@ static u64 t_rcu_scale_writer_started; static u64 t_rcu_scale_writer_finished; static unsigned long b_rcu_gp_test_started; static unsigned long b_rcu_gp_test_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); #define MAX_MEAS 10000 #define MIN_MEAS 100 @@ -143,6 +159,7 @@ struct rcu_scale_ops { void (*sync)(void); void (*exp_sync)(void); struct task_struct *(*rso_gp_kthread)(void); + void (*stats)(void); const char *name; }; @@ -224,6 +241,11 @@ static void srcu_scale_synchronize(void) synchronize_srcu(srcu_ctlp); } +static void srcu_scale_stats(void) +{ + srcu_torture_stats_print(srcu_ctlp, scale_type, SCALE_FLAG); +} + static void srcu_scale_synchronize_expedited(void) { synchronize_srcu_expedited(srcu_ctlp); @@ -241,6 +263,7 @@ static struct rcu_scale_ops srcu_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcu" }; @@ -270,6 +293,7 @@ static struct rcu_scale_ops srcud_ops = { .gp_barrier = srcu_rcu_barrier, .sync = srcu_scale_synchronize, .exp_sync = srcu_scale_synchronize_expedited, + .stats = srcu_scale_stats, .name = "srcud" }; @@ -288,6 +312,11 @@ static void tasks_scale_read_unlock(int idx) { } +static void rcu_tasks_scale_stats(void) +{ + rcu_tasks_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -300,6 +329,7 @@ static struct rcu_scale_ops tasks_ops = { .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, .rso_gp_kthread = get_rcu_tasks_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_scale_stats, .name = "tasks" }; @@ -326,6 +356,11 @@ static void tasks_rude_scale_read_unlock(int idx) { } +static void rcu_tasks_rude_scale_stats(void) +{ + rcu_tasks_rude_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_rude_ops = { .ptype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_scale_init, @@ -333,11 +368,10 @@ static struct rcu_scale_ops tasks_rude_ops = { .readunlock = tasks_rude_scale_read_unlock, .get_gp_seq = rcu_no_completed, .gp_diff = rcu_seq_diff, - .async = call_rcu_tasks_rude, - .gp_barrier = rcu_barrier_tasks_rude, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_rude_scale_stats, .name = "tasks-rude" }; @@ -366,6 +400,11 @@ static void tasks_trace_scale_read_unlock(int idx) rcu_read_unlock_trace(); } +static void rcu_tasks_trace_scale_stats(void) +{ + rcu_tasks_trace_torture_stats_print(scale_type, SCALE_FLAG); +} + static struct rcu_scale_ops tasks_tracing_ops = { .ptype = RCU_TASKS_FLAVOR, .init = rcu_sync_scale_init, @@ -378,6 +417,7 @@ static struct rcu_scale_ops tasks_tracing_ops = { .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, + .stats = IS_ENABLED(CONFIG_TINY_RCU) ? NULL : rcu_tasks_trace_scale_stats, .name = "tasks-tracing" }; @@ -438,12 +478,52 @@ rcu_scale_reader(void *arg) } /* + * Allocate a writer_mblock structure for the specified rcu_scale_writer + * task. + */ +static struct writer_mblock *rcu_scale_alloc(long me) +{ + struct llist_node *llnp; + struct writer_freelist *wflp; + struct writer_mblock *wmbp; + + if (WARN_ON_ONCE(!writer_freelists)) + return NULL; + wflp = &writer_freelists[me]; + if (llist_empty(&wflp->ws_lhp)) { + // ->ws_lhp is private to its rcu_scale_writer task. + wmbp = container_of(llist_del_all(&wflp->ws_lhg), struct writer_mblock, wmb_node); + wflp->ws_lhp.first = &wmbp->wmb_node; + } + llnp = llist_del_first(&wflp->ws_lhp); + if (!llnp) + return NULL; + return container_of(llnp, struct writer_mblock, wmb_node); +} + +/* + * Free a writer_mblock structure to its rcu_scale_writer task. + */ +static void rcu_scale_free(struct writer_mblock *wmbp) +{ + struct writer_freelist *wflp; + + if (!wmbp) + return; + wflp = wmbp->wmb_wfl; + llist_add(&wmbp->wmb_node, &wflp->ws_lhg); +} + +/* * Callback function for asynchronous grace periods from rcu_scale_writer(). */ static void rcu_scale_async_cb(struct rcu_head *rhp) { - atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); + struct writer_mblock *wmbp = container_of(rhp, struct writer_mblock, wmb_rh); + struct writer_freelist *wflp = wmbp->wmb_wfl; + + atomic_dec(&wflp->ws_inflight); + rcu_scale_free(wmbp); } /* @@ -456,12 +536,14 @@ rcu_scale_writer(void *arg) int i_max; unsigned long jdone; long me = (long)arg; - struct rcu_head *rhp = NULL; + bool selfreport = false; bool started = false, done = false, alldone = false; u64 t; DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; + struct writer_freelist *wflp = &writer_freelists[me]; + struct writer_mblock *wmbp = NULL; VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); WARN_ON(!wdpp); @@ -493,30 +575,34 @@ rcu_scale_writer(void *arg) jdone = jiffies + minruntime * HZ; do { + bool gp_succeeded = false; + if (writer_holdoff) udelay(writer_holdoff); if (writer_holdoff_jiffies) schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { -retry: - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_scale_async_cb); - rhp = NULL; + if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) { + if (!wmbp) + wmbp = rcu_scale_alloc(me); + if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) { + atomic_inc(&wflp->ws_inflight); + cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb); + wmbp = NULL; + gp_succeeded = true; } else if (!kthread_should_stop()) { cur_ops->gp_barrier(); - goto retry; } else { - kfree(rhp); /* Because we are stopping. */ + rcu_scale_free(wmbp); /* Because we are stopping. */ + wmbp = NULL; } } else if (gp_exp) { cur_ops->exp_sync(); + gp_succeeded = true; } else { cur_ops->sync(); + gp_succeeded = true; } t = ktime_get_mono_fast_ns(); *wdp = t - *wdp; @@ -526,6 +612,7 @@ retry: started = true; if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; + WRITE_ONCE(writer_done[me], true); sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", scale_type, SCALE_FLAG, me, MIN_MEAS); @@ -551,11 +638,32 @@ retry: if (done && !alldone && atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) alldone = true; - if (started && !alldone && i < MAX_MEAS - 1) + if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) { + static atomic_t dumped; + int i; + + if (!atomic_xchg(&dumped, 1)) { + for (i = 0; i < nrealwriters; i++) { + if (writer_done[i]) + continue; + pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i); + sched_show_task(writer_tasks[i]); + } + if (cur_ops->stats) + cur_ops->stats(); + } + } + if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) { + pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n", + __func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone); + selfreport = true; + } + if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1) i++; rcu_scale_wait_shutdown(); } while (!torture_must_stop()); - if (gp_async) { + if (gp_async && cur_ops->async) { + rcu_scale_free(wmbp); cur_ops->gp_barrier(); } writer_n_durations[me] = i_max + 1; @@ -713,6 +821,7 @@ kfree_scale_cleanup(void) torture_stop_kthread(kfree_scale_thread, kfree_reader_tasks[i]); kfree(kfree_reader_tasks); + kfree_reader_tasks = NULL; } torture_cleanup_end(); @@ -881,6 +990,7 @@ rcu_scale_cleanup(void) torture_stop_kthread(rcu_scale_reader, reader_tasks[i]); kfree(reader_tasks); + reader_tasks = NULL; } if (writer_tasks) { @@ -919,10 +1029,33 @@ rcu_scale_cleanup(void) schedule_timeout_uninterruptible(1); } kfree(writer_durations[i]); + if (writer_freelists) { + int ctr = 0; + struct llist_node *llnp; + struct writer_freelist *wflp = &writer_freelists[i]; + + if (wflp->ws_mblocks) { + llist_for_each(llnp, wflp->ws_lhg.first) + ctr++; + llist_for_each(llnp, wflp->ws_lhp.first) + ctr++; + WARN_ONCE(ctr != gp_async_max, + "%s: ctr = %d gp_async_max = %d\n", + __func__, ctr, gp_async_max); + kfree(wflp->ws_mblocks); + } + } } kfree(writer_tasks); + writer_tasks = NULL; kfree(writer_durations); + writer_durations = NULL; kfree(writer_n_durations); + writer_n_durations = NULL; + kfree(writer_done); + writer_done = NULL; + kfree(writer_freelists); + writer_freelists = NULL; } /* Do torture-type-specific cleanup operations. */ @@ -949,8 +1082,9 @@ rcu_scale_shutdown(void *arg) static int __init rcu_scale_init(void) { - long i; int firsterr = 0; + long i; + long j; static struct rcu_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; @@ -1017,14 +1151,22 @@ rcu_scale_init(void) } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { + writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]), GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL); + writer_n_durations = kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); + writer_done = kcalloc(nrealwriters, sizeof(writer_done[0]), GFP_KERNEL); + if (gp_async) { + if (gp_async_max <= 0) { + pr_warn("%s: gp_async_max = %d must be greater than zero.\n", + __func__, gp_async_max); + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + firsterr = -EINVAL; + goto unwind; + } + writer_freelists = kcalloc(nrealwriters, sizeof(writer_freelists[0]), GFP_KERNEL); + } + if (!writer_tasks || !writer_durations || !writer_n_durations || !writer_done || + (gp_async && !writer_freelists)) { SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; @@ -1037,6 +1179,24 @@ rcu_scale_init(void) firsterr = -ENOMEM; goto unwind; } + if (writer_freelists) { + struct writer_freelist *wflp = &writer_freelists[i]; + + init_llist_head(&wflp->ws_lhg); + init_llist_head(&wflp->ws_lhp); + wflp->ws_mblocks = kcalloc(gp_async_max, sizeof(wflp->ws_mblocks[0]), + GFP_KERNEL); + if (!wflp->ws_mblocks) { + firsterr = -ENOMEM; + goto unwind; + } + for (j = 0; j < gp_async_max; j++) { + struct writer_mblock *wmbp = &wflp->ws_mblocks[j]; + + wmbp->wmb_wfl = wflp; + llist_add(&wmbp->wmb_node, &wflp->ws_lhp); + } + } firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); if (torture_init_error(firsterr)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 08bf7c669dd3..bb75dbf5c800 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -115,6 +115,7 @@ torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s torture_param(bool, stall_no_softlockup, false, "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_cpu_repeat, 0, "Number of additional stalls after the first one."); torture_param(int, stall_gp_kthread, 0, "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -366,8 +367,6 @@ struct rcu_torture_ops { bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2); unsigned long (*get_gp_state)(void); void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp); - unsigned long (*get_gp_completed)(void); - void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp); unsigned long (*start_gp_poll)(void); void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp); bool (*poll_gp_state)(unsigned long oldstate); @@ -375,6 +374,8 @@ struct rcu_torture_ops { bool (*poll_need_2gp)(bool poll, bool poll_full); void (*cond_sync)(unsigned long oldstate); void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp); + int poll_active; + int poll_active_full; call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); @@ -553,8 +554,6 @@ static struct rcu_torture_ops rcu_ops = { .get_comp_state_full = get_completed_synchronize_rcu_full, .get_gp_state = get_state_synchronize_rcu, .get_gp_state_full = get_state_synchronize_rcu_full, - .get_gp_completed = get_completed_synchronize_rcu, - .get_gp_completed_full = get_completed_synchronize_rcu_full, .start_gp_poll = start_poll_synchronize_rcu, .start_gp_poll_full = start_poll_synchronize_rcu_full, .poll_gp_state = poll_state_synchronize_rcu, @@ -562,6 +561,8 @@ static struct rcu_torture_ops rcu_ops = { .poll_need_2gp = rcu_poll_need_2gp, .cond_sync = cond_synchronize_rcu, .cond_sync_full = cond_synchronize_rcu_full, + .poll_active = NUM_ACTIVE_RCU_POLL_OLDSTATE, + .poll_active_full = NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, .get_gp_state_exp = get_state_synchronize_rcu, .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, @@ -740,9 +741,12 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -780,9 +784,12 @@ static struct rcu_torture_ops srcud_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .same_gp_state = same_state_synchronize_srcu, + .get_comp_state = get_completed_synchronize_srcu, .get_gp_state = srcu_torture_get_gp_state, .start_gp_poll = srcu_torture_start_gp_poll, .poll_gp_state = srcu_torture_poll_gp_state, + .poll_active = NUM_ACTIVE_SRCU_POLL_OLDSTATE, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -915,11 +922,6 @@ static struct rcu_torture_ops tasks_ops = { * Definitions for rude RCU-tasks torture testing. */ -static void rcu_tasks_rude_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_tasks_rude(&p->rtort_rcu, rcu_torture_cb); -} - static struct rcu_torture_ops tasks_rude_ops = { .ttype = RCU_TASKS_RUDE_FLAVOR, .init = rcu_sync_torture_init, @@ -927,11 +929,8 @@ static struct rcu_torture_ops tasks_rude_ops = { .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, .get_gp_seq = rcu_no_completed, - .deferred_free = rcu_tasks_rude_torture_deferred_free, .sync = synchronize_rcu_tasks_rude, .exp_sync = synchronize_rcu_tasks_rude, - .call = call_rcu_tasks_rude, - .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, @@ -1318,6 +1317,7 @@ static void rcu_torture_write_types(void) } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } + pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); } /* @@ -1374,17 +1374,20 @@ rcu_torture_writer(void *arg) int i; int idx; int oldnice = task_nice(current); - struct rcu_gp_oldstate rgo[NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE]; + struct rcu_gp_oldstate *rgo = NULL; + int rgo_size = 0; struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); unsigned long stallsdone = jiffies; bool stutter_waited; - unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + unsigned long *ulo = NULL; + int ulo_size = 0; // If a new stall test is added, this must be adjusted. if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) - stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ; + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * + HZ * (stall_cpu_repeat + 1); VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -1401,6 +1404,16 @@ rcu_torture_writer(void *arg) torture_kthread_stopping("rcu_torture_writer"); return 0; } + if (cur_ops->poll_active > 0) { + ulo = kzalloc(cur_ops->poll_active * sizeof(ulo[0]), GFP_KERNEL); + if (!WARN_ON(!ulo)) + ulo_size = cur_ops->poll_active; + } + if (cur_ops->poll_active_full > 0) { + rgo = kzalloc(cur_ops->poll_active_full * sizeof(rgo[0]), GFP_KERNEL); + if (!WARN_ON(!rgo)) + rgo_size = cur_ops->poll_active_full; + } do { rcu_torture_writer_state = RTWS_FIXED_DELAY; @@ -1437,8 +1450,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); - if (cur_ops->get_gp_completed) { - cookie = cur_ops->get_gp_completed(); + if (cur_ops->get_comp_state) { + cookie = cur_ops->get_comp_state(); WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); } cur_ops->readunlock(idx); @@ -1452,8 +1465,8 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state_getname(), rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); - if (cur_ops->get_gp_completed_full) { - cur_ops->get_gp_completed_full(&cookie_full); + if (cur_ops->get_comp_state_full) { + cur_ops->get_comp_state_full(&cookie_full); WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full)); } cur_ops->readunlock(idx); @@ -1502,19 +1515,19 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) ulo[i] = cur_ops->get_comp_state(); gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; while (!cur_ops->poll_gp_state(gp_snap)) { gp_snap1 = cur_ops->get_gp_state(); - for (i = 0; i < ARRAY_SIZE(ulo); i++) + for (i = 0; i < ulo_size; i++) if (cur_ops->poll_gp_state(ulo[i]) || cur_ops->same_gp_state(ulo[i], gp_snap1)) { ulo[i] = gp_snap1; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(ulo)); + WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1522,20 +1535,20 @@ rcu_torture_writer(void *arg) break; case RTWS_POLL_GET_FULL: rcu_torture_writer_state = RTWS_POLL_GET_FULL; - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) cur_ops->get_comp_state_full(&rgo[i]); cur_ops->start_gp_poll_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) { cur_ops->get_gp_state_full(&gp_snap1_full); - for (i = 0; i < ARRAY_SIZE(rgo); i++) + for (i = 0; i < rgo_size; i++) if (cur_ops->poll_gp_state_full(&rgo[i]) || cur_ops->same_gp_state_full(&rgo[i], &gp_snap1_full)) { rgo[i] = gp_snap1_full; break; } - WARN_ON_ONCE(i >= ARRAY_SIZE(rgo)); + WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); } @@ -1617,6 +1630,8 @@ rcu_torture_writer(void *arg) pr_alert("%s" TORTURE_FLAG " Dynamic grace-period expediting was disabled.\n", torture_type); + kfree(ulo); + kfree(rgo); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; @@ -2370,7 +2385,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " - "stall_cpu_block=%d " + "stall_cpu_block=%d stall_cpu_repeat=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " "read_exit_delay=%d read_exit_burst=%d " @@ -2382,7 +2397,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, - stall_cpu_block, + stall_cpu_block, stall_cpu_repeat, n_barrier_cbs, onoff_interval, onoff_holdoff, read_exit_delay, read_exit_burst, @@ -2460,19 +2475,11 @@ static struct notifier_block rcu_torture_stall_block = { * induces a CPU stall for the time specified by stall_cpu. If a new * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ -static int rcu_torture_stall(void *args) +static void rcu_torture_stall_one(int rep, int irqsoff) { int idx; - int ret; unsigned long stop_at; - VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); - if (rcu_cpu_stall_notifiers) { - ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); - if (ret) - pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", - __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); - } if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2492,12 +2499,12 @@ static int rcu_torture_stall(void *args) stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ idx = cur_ops->readlock(); - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("%s start on CPU %d.\n", - __func__, raw_smp_processor_id()); + pr_alert("%s start stall episode %d on CPU %d.\n", + __func__, rep + 1, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && !kthread_should_stop()) if (stall_cpu_block) { @@ -2509,12 +2516,42 @@ static int rcu_torture_stall(void *args) } else if (stall_no_softlockup) { touch_softlockup_watchdog(); } - if (stall_cpu_irqsoff) + if (irqsoff) local_irq_enable(); else if (!stall_cpu_block) preempt_enable(); cur_ops->readunlock(idx); } +} + +/* + * CPU-stall kthread. Invokes rcu_torture_stall_one() once, and then as many + * additional times as specified by the stall_cpu_repeat module parameter. + * Note that stall_cpu_irqsoff is ignored on the second and subsequent + * stall. + */ +static int rcu_torture_stall(void *args) +{ + int i; + int repeat = stall_cpu_repeat; + int ret; + + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + if (repeat < 0) { + repeat = 0; + WARN_ON_ONCE(IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)); + } + if (rcu_cpu_stall_notifiers) { + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); + } + for (i = 0; i <= repeat; i++) { + if (kthread_should_stop()) + break; + rcu_torture_stall_one(i, i == 0 ? stall_cpu_irqsoff : 0); + } pr_alert("%s end.\n", __func__); if (rcu_cpu_stall_notifiers && !ret) { ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); @@ -2680,7 +2717,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(freed); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } @@ -2830,7 +2867,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f4ea5b1ec068..0db9db73f57f 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -28,6 +28,7 @@ #include <linux/rcupdate_trace.h> #include <linux/reboot.h> #include <linux/sched.h> +#include <linux/seq_buf.h> #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/stat.h> @@ -134,7 +135,7 @@ struct ref_scale_ops { const char *name; }; -static struct ref_scale_ops *cur_ops; +static const struct ref_scale_ops *cur_ops; static void un_delay(const int udl, const int ndl) { @@ -170,7 +171,7 @@ static bool rcu_sync_scale_init(void) return true; } -static struct ref_scale_ops rcu_ops = { +static const struct ref_scale_ops rcu_ops = { .init = rcu_sync_scale_init, .readsection = ref_rcu_read_section, .delaysection = ref_rcu_delay_section, @@ -204,7 +205,7 @@ static void srcu_ref_scale_delay_section(const int nloops, const int udl, const } } -static struct ref_scale_ops srcu_ops = { +static const struct ref_scale_ops srcu_ops = { .init = rcu_sync_scale_init, .readsection = srcu_ref_scale_read_section, .delaysection = srcu_ref_scale_delay_section, @@ -231,7 +232,7 @@ static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, c un_delay(udl, ndl); } -static struct ref_scale_ops rcu_tasks_ops = { +static const struct ref_scale_ops rcu_tasks_ops = { .init = rcu_sync_scale_init, .readsection = rcu_tasks_ref_scale_read_section, .delaysection = rcu_tasks_ref_scale_delay_section, @@ -270,7 +271,7 @@ static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, c } } -static struct ref_scale_ops rcu_trace_ops = { +static const struct ref_scale_ops rcu_trace_ops = { .init = rcu_sync_scale_init, .readsection = rcu_trace_ref_scale_read_section, .delaysection = rcu_trace_ref_scale_delay_section, @@ -309,7 +310,7 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops refcnt_ops = { +static const struct ref_scale_ops refcnt_ops = { .init = rcu_sync_scale_init, .readsection = ref_refcnt_section, .delaysection = ref_refcnt_delay_section, @@ -346,7 +347,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int } } -static struct ref_scale_ops rwlock_ops = { +static const struct ref_scale_ops rwlock_ops = { .init = ref_rwlock_init, .readsection = ref_rwlock_section, .delaysection = ref_rwlock_delay_section, @@ -383,7 +384,7 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n } } -static struct ref_scale_ops rwsem_ops = { +static const struct ref_scale_ops rwsem_ops = { .init = ref_rwsem_init, .readsection = ref_rwsem_section, .delaysection = ref_rwsem_delay_section, @@ -418,7 +419,7 @@ static void ref_lock_delay_section(const int nloops, const int udl, const int nd preempt_enable(); } -static struct ref_scale_ops lock_ops = { +static const struct ref_scale_ops lock_ops = { .readsection = ref_lock_section, .delaysection = ref_lock_delay_section, .name = "lock" @@ -453,7 +454,7 @@ static void ref_lock_irq_delay_section(const int nloops, const int udl, const in preempt_enable(); } -static struct ref_scale_ops lock_irq_ops = { +static const struct ref_scale_ops lock_irq_ops = { .readsection = ref_lock_irq_section, .delaysection = ref_lock_irq_delay_section, .name = "lock-irq" @@ -489,7 +490,7 @@ static void ref_acqrel_delay_section(const int nloops, const int udl, const int preempt_enable(); } -static struct ref_scale_ops acqrel_ops = { +static const struct ref_scale_ops acqrel_ops = { .readsection = ref_acqrel_section, .delaysection = ref_acqrel_delay_section, .name = "acqrel" @@ -523,7 +524,7 @@ static void ref_clock_delay_section(const int nloops, const int udl, const int n stopopts = x; } -static struct ref_scale_ops clock_ops = { +static const struct ref_scale_ops clock_ops = { .readsection = ref_clock_section, .delaysection = ref_clock_delay_section, .name = "clock" @@ -555,7 +556,7 @@ static void ref_jiffies_delay_section(const int nloops, const int udl, const int stopopts = x; } -static struct ref_scale_ops jiffies_ops = { +static const struct ref_scale_ops jiffies_ops = { .readsection = ref_jiffies_section, .delaysection = ref_jiffies_delay_section, .name = "jiffies" @@ -705,9 +706,9 @@ static void refscale_typesafe_ctor(void *rtsp_in) preempt_enable(); } -static struct ref_scale_ops typesafe_ref_ops; -static struct ref_scale_ops typesafe_lock_ops; -static struct ref_scale_ops typesafe_seqlock_ops; +static const struct ref_scale_ops typesafe_ref_ops; +static const struct ref_scale_ops typesafe_lock_ops; +static const struct ref_scale_ops typesafe_seqlock_ops; // Initialize for a typesafe test. static bool typesafe_init(void) @@ -768,7 +769,7 @@ static void typesafe_cleanup(void) } // The typesafe_init() function distinguishes these structures by address. -static struct ref_scale_ops typesafe_ref_ops = { +static const struct ref_scale_ops typesafe_ref_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -776,7 +777,7 @@ static struct ref_scale_ops typesafe_ref_ops = { .name = "typesafe_ref" }; -static struct ref_scale_ops typesafe_lock_ops = { +static const struct ref_scale_ops typesafe_lock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -784,7 +785,7 @@ static struct ref_scale_ops typesafe_lock_ops = { .name = "typesafe_lock" }; -static struct ref_scale_ops typesafe_seqlock_ops = { +static const struct ref_scale_ops typesafe_seqlock_ops = { .init = typesafe_init, .cleanup = typesafe_cleanup, .readsection = typesafe_read_section, @@ -891,32 +892,34 @@ static u64 process_durations(int n) { int i; struct reader_task *rt; - char buf1[64]; + struct seq_buf s; char *buf; u64 sum = 0; buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; - buf[0] = 0; - sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", - exp_idx); + seq_buf_init(&s, buf, 800 + 64); + + seq_buf_printf(&s, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)", + exp_idx); for (i = 0; i < n && !torture_must_stop(); i++) { rt = &(reader_tasks[i]); - sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns); if (i % 5 == 0) - strcat(buf, "\n"); - if (strlen(buf) >= 800) { - pr_alert("%s", buf); - buf[0] = 0; + seq_buf_putc(&s, '\n'); + + if (seq_buf_used(&s) >= 800) { + pr_alert("%s", seq_buf_str(&s)); + seq_buf_clear(&s); } - strcat(buf, buf1); + + seq_buf_printf(&s, "%d: %llu\t", i, rt->last_duration_ns); sum += rt->last_duration_ns; } - pr_alert("%s\n", buf); + pr_alert("%s\n", seq_buf_str(&s)); kfree(buf); return sum; @@ -1023,7 +1026,7 @@ end: } static void -ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) +ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, @@ -1078,7 +1081,7 @@ ref_scale_init(void) { long i; int firsterr = 0; - static struct ref_scale_ops *scale_ops[] = { + static const struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index b24db425f16d..31706e3293bc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -137,6 +137,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp) sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq; sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq; + sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head; sdp->mynode = NULL; sdp->cpu = cpu; INIT_WORK(&sdp->work, srcu_invoke_callbacks); @@ -247,7 +248,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) mutex_init(&ssp->srcu_sup->srcu_cb_mutex); mutex_init(&ssp->srcu_sup->srcu_gp_mutex); ssp->srcu_idx = 0; - ssp->srcu_sup->srcu_gp_seq = 0; + ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_barrier_seq = 0; mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); @@ -258,7 +259,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) if (!ssp->sda) goto err_free_sup; init_srcu_struct_data(ssp); - ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; + ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) @@ -266,7 +267,8 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; - smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ + smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, + SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */ return 0; err_free_sda: @@ -628,6 +630,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) if (time_after(j, gpstart)) jbase += j - gpstart; if (!jbase) { + ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay); WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1); if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) jbase = 1; @@ -1560,6 +1563,7 @@ static void srcu_barrier_cb(struct rcu_head *rhp) struct srcu_data *sdp; struct srcu_struct *ssp; + rhp->next = rhp; // Mark the callback as having been invoked. sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); ssp = sdp->ssp; if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt)) @@ -1818,6 +1822,7 @@ static void process_srcu(struct work_struct *work) } else { j = jiffies; if (READ_ONCE(sup->reschedule_jiffies) == j) { + ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count); WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1); if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay) curdelay = 1; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ba3440a45b6d..6333f4ccf024 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -34,6 +34,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_blkd_tasks: List of tasks blocked as readers. * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. + * @index: Index of this CPU in rtpcp_array of the rcu_tasks structure. * @rtpp: Pointer to the rcu_tasks structure. */ struct rcu_tasks_percpu { @@ -49,6 +50,7 @@ struct rcu_tasks_percpu { struct list_head rtp_blkd_tasks; struct list_head rtp_exit_list; int cpu; + int index; struct rcu_tasks *rtpp; }; @@ -63,7 +65,7 @@ struct rcu_tasks_percpu { * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @tasks_gp_seq: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot in upper bits. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. @@ -76,6 +78,7 @@ struct rcu_tasks_percpu { * @call_func: This flavor's call_rcu()-equivalent function. * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @rtpcp_array: Array of pointers to rcu_tasks_percpu structure of CPUs in cpu_possible_mask. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. @@ -84,6 +87,7 @@ struct rcu_tasks_percpu { * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. * @barrier_q_seq: Sequence number for barrier operations. + * @barrier_q_start: Most recent barrier start in jiffies. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -110,6 +114,7 @@ struct rcu_tasks { call_rcu_func_t call_func; unsigned int wait_state; struct rcu_tasks_percpu __percpu *rtpcpu; + struct rcu_tasks_percpu **rtpcp_array; int percpu_enqueue_shift; int percpu_enqueue_lim; int percpu_dequeue_lim; @@ -118,6 +123,7 @@ struct rcu_tasks { atomic_t barrier_q_count; struct completion barrier_q_completion; unsigned long barrier_q_seq; + unsigned long barrier_q_start; char *name; char *kname; }; @@ -182,6 +188,8 @@ module_param(rcu_task_collapse_lim, int, 0444); static int rcu_task_lazy_lim __read_mostly = 32; module_param(rcu_task_lazy_lim, int, 0444); +static int rcu_task_cpu_ids; + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -245,6 +253,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int cpu; int lim; int shift; + int maxcpu; + int index = 0; if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; @@ -254,14 +264,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) } lim = rcu_task_enqueue_lim; - if (lim > nr_cpu_ids) - lim = nr_cpu_ids; - shift = ilog2(nr_cpu_ids / lim); - if (((nr_cpu_ids - 1) >> shift) >= lim) - shift++; - WRITE_ONCE(rtp->percpu_enqueue_shift, shift); - WRITE_ONCE(rtp->percpu_dequeue_lim, lim); - smp_store_release(&rtp->percpu_enqueue_lim, lim); + rtp->rtpcp_array = kcalloc(num_possible_cpus(), sizeof(struct rcu_tasks_percpu *), GFP_KERNEL); + BUG_ON(!rtp->rtpcp_array); + for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -273,14 +278,30 @@ static void cblist_init_generic(struct rcu_tasks *rtp) INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; + rtpcp->index = index; + rtp->rtpcp_array[index] = rtpcp; + index++; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); if (!rtpcp->rtp_exit_list.next) INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + rtpcp->barrier_q_head.next = &rtpcp->barrier_q_head; + maxcpu = cpu; } - pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, - data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); + rcu_task_cpu_ids = maxcpu + 1; + if (lim > rcu_task_cpu_ids) + lim = rcu_task_cpu_ids; + shift = ilog2(rcu_task_cpu_ids / lim); + if (((rcu_task_cpu_ids - 1) >> shift) >= lim) + shift++; + WRITE_ONCE(rtp->percpu_enqueue_shift, shift); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + + pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d rcu_task_cpu_ids=%d.\n", + rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), + rcu_task_cb_adjust, rcu_task_cpu_ids); } // Compute wakeup time for lazy callback timer. @@ -339,6 +360,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rcu_read_lock(); ideal_cpu = smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift); chosen_cpu = cpumask_next(ideal_cpu - 1, cpu_possible_mask); + WARN_ON_ONCE(chosen_cpu >= rcu_task_cpu_ids); rtpcp = per_cpu_ptr(rtp->rtpcpu, chosen_cpu); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. @@ -348,7 +370,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp->rtp_n_lock_retries = 0; } if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && - READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + READ_ONCE(rtp->percpu_enqueue_lim) != rcu_task_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } // Queuing callbacks before initialization not yet supported. @@ -368,10 +390,10 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + if (rtp->percpu_enqueue_lim != rcu_task_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, 0); - WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); - smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + WRITE_ONCE(rtp->percpu_dequeue_lim, rcu_task_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, rcu_task_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -388,6 +410,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) struct rcu_tasks *rtp; struct rcu_tasks_percpu *rtpcp; + rhp->next = rhp; // Mark the callback as having been invoked. rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); rtp = rtpcp->rtpp; if (atomic_dec_and_test(&rtp->barrier_q_count)) @@ -396,7 +419,7 @@ static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) // Wait for all in-flight callbacks for the specified RCU Tasks flavor. // Operates in a manner similar to rcu_barrier(). -static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +static void __maybe_unused rcu_barrier_tasks_generic(struct rcu_tasks *rtp) { int cpu; unsigned long flags; @@ -409,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) mutex_unlock(&rtp->barrier_q_mutex); return; } + rtp->barrier_q_start = jiffies; rcu_seq_start(&rtp->barrier_q_seq); init_completion(&rtp->barrier_q_completion); atomic_set(&rtp->barrier_q_count, 2); @@ -444,6 +468,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); for (cpu = 0; cpu < dequeue_limit; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -481,7 +507,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim > 1) { - WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(rcu_task_cpu_ids)); smp_store_release(&rtp->percpu_enqueue_lim, 1); rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); gpdone = false; @@ -496,7 +522,9 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); } if (rtp->percpu_dequeue_lim == 1) { - for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { + for (cpu = rtp->percpu_dequeue_lim; cpu < rcu_task_cpu_ids; cpu++) { + if (!cpu_possible(cpu)) + continue; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); @@ -511,30 +539,32 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) // Advance callbacks and invoke any that are ready. static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) { - int cpu; - int cpunext; int cpuwq; unsigned long flags; int len; + int index; struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_tasks_percpu *rtpcp_next; - cpu = rtpcp->cpu; - cpunext = cpu * 2 + 1; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; - queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); - cpunext++; - if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { - rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); - cpuwq = rcu_cpu_beenfullyonline(cpunext) ? cpunext : WORK_CPU_UNBOUND; + index = rtpcp->index * 2 + 1; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + index++; + if (index < num_possible_cpus()) { + rtpcp_next = rtp->rtpcp_array[index]; + if (rtpcp_next->cpu < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + cpuwq = rcu_cpu_beenfullyonline(rtpcp_next->cpu) ? rtpcp_next->cpu : WORK_CPU_UNBOUND; + queue_work_on(cpuwq, system_wq, &rtpcp_next->rtp_work); + } + } } } - if (rcu_segcblist_empty(&rtpcp->cblist) || !cpu_possible(cpu)) + if (rcu_segcblist_empty(&rtpcp->cblist)) return; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); @@ -687,9 +717,7 @@ static void __init rcu_tasks_bootup_oddness(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -#endif /* #ifndef CONFIG_TINY_RCU */ -#ifndef CONFIG_TINY_RCU /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { @@ -723,6 +751,53 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) rtp->lazy_jiffies, s); } + +/* Dump out more rcutorture-relevant state common to all RCU-tasks flavors. */ +static void rcu_tasks_torture_stats_print_generic(struct rcu_tasks *rtp, char *tt, + char *tf, char *tst) +{ + cpumask_var_t cm; + int cpu; + bool gotcb = false; + unsigned long j = jiffies; + + pr_alert("%s%s Tasks%s RCU g%ld gp_start %lu gp_jiffies %lu gp_state %d (%s).\n", + tt, tf, tst, data_race(rtp->tasks_gp_seq), + j - data_race(rtp->gp_start), j - data_race(rtp->gp_jiffies), + data_race(rtp->gp_state), tasks_gp_state_getname(rtp)); + pr_alert("\tEnqueue shift %d limit %d Dequeue limit %d gpseq %lu.\n", + data_race(rtp->percpu_enqueue_shift), + data_race(rtp->percpu_enqueue_lim), + data_race(rtp->percpu_dequeue_lim), + data_race(rtp->percpu_dequeue_gpseq)); + (void)zalloc_cpumask_var(&cm, GFP_KERNEL); + pr_alert("\tCallback counts:"); + for_each_possible_cpu(cpu) { + long n; + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (cpumask_available(cm) && !rcu_barrier_cb_is_done(&rtpcp->barrier_q_head)) + cpumask_set_cpu(cpu, cm); + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (!n) + continue; + pr_cont(" %d:%ld", cpu, n); + gotcb = true; + } + if (gotcb) + pr_cont(".\n"); + else + pr_cont(" (none).\n"); + pr_alert("\tBarrier seq %lu start %lu count %d holdout CPUs ", + data_race(rtp->barrier_q_seq), j - data_race(rtp->barrier_q_start), + atomic_read(&rtp->barrier_q_count)); + if (cpumask_available(cm) && !cpumask_empty(cm)) + pr_cont(" %*pbl.\n", cpumask_pr_args(cm)); + else + pr_cont("(none).\n"); + free_cpumask_var(cm); +} + #endif // #ifndef CONFIG_TINY_RCU static void exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -1174,6 +1249,12 @@ void show_rcu_tasks_classic_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); + +void rcu_tasks_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_gp_kthread(void) @@ -1244,13 +1325,12 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } //////////////////////////////////////////////////////////////////////// // -// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of -// passing an empty function to schedule_on_each_cpu(). This approach -// provides an asynchronous call_rcu_tasks_rude() API and batching of -// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API. -// This invokes schedule_on_each_cpu() in order to send IPIs far and wide -// and induces otherwise unnecessary context switches on all online CPUs, -// whether idle or not. +// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's +// trick of passing an empty function to schedule_on_each_cpu(). +// This approach provides batching of concurrent calls to the synchronous +// synchronize_rcu_tasks_rude() API. This invokes schedule_on_each_cpu() +// in order to send IPIs far and wide and induces otherwise unnecessary +// context switches on all online CPUs, whether idle or not. // // Callback handling is provided by the rcu_tasks_kthread() function. // @@ -1268,11 +1348,11 @@ static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp) schedule_on_each_cpu(rcu_tasks_be_rude); } -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func); DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, "RCU Tasks Rude"); -/** +/* * call_rcu_tasks_rude() - Queue a callback rude task-based grace period * @rhp: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period @@ -1289,12 +1369,14 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude, * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. + * + * This is no longer exported, and is instead reserved for use by + * synchronize_rcu_tasks_rude(). */ -void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) +static void call_rcu_tasks_rude(struct rcu_head *rhp, rcu_callback_t func) { call_rcu_tasks_generic(rhp, func, &rcu_tasks_rude); } -EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); /** * synchronize_rcu_tasks_rude - wait for a rude rcu-tasks grace period @@ -1320,26 +1402,9 @@ void synchronize_rcu_tasks_rude(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); -/** - * rcu_barrier_tasks_rude - Wait for in-flight call_rcu_tasks_rude() callbacks. - * - * Although the current implementation is guaranteed to wait, it is not - * obligated to, for example, if there are no pending callbacks. - */ -void rcu_barrier_tasks_rude(void) -{ - rcu_barrier_tasks_generic(&rcu_tasks_rude); -} -EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); - -int rcu_tasks_rude_lazy_ms = -1; -module_param(rcu_tasks_rude_lazy_ms, int, 0444); - static int __init rcu_spawn_tasks_rude_kthread(void) { rcu_tasks_rude.gp_sleep = HZ / 10; - if (rcu_tasks_rude_lazy_ms >= 0) - rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -1350,6 +1415,12 @@ void show_rcu_tasks_rude_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); + +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_rude, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_rude_gp_kthread(void) @@ -1613,7 +1684,7 @@ static int trc_inspect_reader(struct task_struct *t, void *bhp_in) // However, we cannot safely change its state. n_heavy_reader_attempts++; // Check for "running" idle tasks on offline CPUs. - if (!rcu_dynticks_zero_in_eqs(cpu, &t->trc_reader_nesting)) + if (!rcu_watching_zero_in_eqs(cpu, &t->trc_reader_nesting)) return -EINVAL; // No quiescent state, do it the hard way. n_heavy_reader_updates++; nesting = 0; @@ -2027,6 +2098,12 @@ void show_rcu_tasks_trace_gp_kthread(void) show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); + +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf) +{ + rcu_tasks_torture_stats_print_generic(&rcu_tasks_trace, tt, tf, ""); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_torture_stats_print); #endif // !defined(CONFIG_TINY_RCU) struct task_struct *get_rcu_tasks_trace_gp_kthread(void) @@ -2070,17 +2147,13 @@ static struct rcu_tasks_test_desc tests[] = { .notrun = IS_ENABLED(CONFIG_TASKS_RCU), }, { - .name = "call_rcu_tasks_rude()", - /* If not defined, the test is skipped. */ - .notrun = IS_ENABLED(CONFIG_TASKS_RUDE_RCU), - }, - { .name = "call_rcu_tasks_trace()", /* If not defined, the test is skipped. */ .notrun = IS_ENABLED(CONFIG_TASKS_TRACE_RCU) } }; +#if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void test_rcu_tasks_callback(struct rcu_head *rhp) { struct rcu_tasks_test_desc *rttd = @@ -2090,6 +2163,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) rttd->notrun = false; } +#endif // #if defined(CONFIG_TASKS_RCU) || defined(CONFIG_TASKS_TRACE_RCU) static void rcu_tasks_initiate_self_tests(void) { @@ -2102,16 +2176,14 @@ static void rcu_tasks_initiate_self_tests(void) #ifdef CONFIG_TASKS_RUDE_RCU pr_info("Running RCU Tasks Rude wait API self tests\n"); - tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); - call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU pr_info("Running RCU Tasks Trace wait API self tests\n"); - tests[2].runstart = jiffies; + tests[1].runstart = jiffies; synchronize_rcu_tasks_trace(); - call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); + call_rcu_tasks_trace(&tests[1].rh, test_rcu_tasks_callback); #endif } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4402d6f5f857..b3b3ce34df63 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -105,7 +105,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) } /* Invoke the RCU callbacks whose grace period has elapsed. */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(void) { struct rcu_head *next, *list; unsigned long flags; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..a60616e69b66 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,9 +79,6 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, -#ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_RCU_CORE, -#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, @@ -97,6 +94,9 @@ static struct rcu_state rcu_state = { .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, rcu_sr_normal_gp_cleanup_work), .srs_cleanups_pending = ATOMIC_INIT(0), +#ifdef CONFIG_RCU_NOCB_CPU + .nocb_mutex = __MUTEX_INITIALIZER(rcu_state.nocb_mutex), +#endif }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -283,37 +283,45 @@ void rcu_softirq_qs(void) } /* - * Reset the current CPU's ->dynticks counter to indicate that the + * Reset the current CPU's RCU_WATCHING counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. * This will either leave the counter unchanged, or increment it * to the next non-quiescent value. * * The non-atomic test/increment sequence works because the upper bits - * of the ->dynticks counter are manipulated only by the corresponding CPU, + * of the ->state variable are manipulated only by the corresponding CPU, * or when the corresponding CPU is offline. */ -static void rcu_dynticks_eqs_online(void) +static void rcu_watching_online(void) { - if (ct_dynticks() & RCU_DYNTICKS_IDX) + if (ct_rcu_watching() & CT_RCU_WATCHING) return; - ct_state_inc(RCU_DYNTICKS_IDX); + ct_state_inc(CT_RCU_WATCHING); } /* - * Return true if the snapshot returned from rcu_dynticks_snap() + * Return true if the snapshot returned from ct_rcu_watching() * indicates that RCU is in an extended quiescent state. */ -static bool rcu_dynticks_in_eqs(int snap) +static bool rcu_watching_snap_in_eqs(int snap) { - return !(snap & RCU_DYNTICKS_IDX); + return !(snap & CT_RCU_WATCHING); } -/* - * Return true if the CPU corresponding to the specified rcu_data - * structure has spent some time in an extended quiescent state since - * rcu_dynticks_snap() returned the specified snapshot. +/** + * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU + * since the specified @snap? + * + * @rdp: The rcu_data corresponding to the CPU for which to check EQS. + * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS. + * + * Returns true if the CPU corresponding to @rdp has spent some time in an + * extended quiescent state since @snap. Note that this doesn't check if it + * /still/ is in an EQS, just that it went through one since @snap. + * + * This is meant to be used in a loop waiting for a CPU to go through an EQS. */ -static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) +static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap) { /* * The first failing snapshot is already ordered against the accesses @@ -323,26 +331,29 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) * performed by the remote CPU prior to entering idle and therefore can * rely solely on acquire semantics. */ - return snap != ct_dynticks_cpu_acquire(rdp->cpu); + if (WARN_ON_ONCE(rcu_watching_snap_in_eqs(snap))) + return true; + + return snap != ct_rcu_watching_cpu_acquire(rdp->cpu); } /* * Return true if the referenced integer is zero while the specified * CPU remains within a single extended quiescent state. */ -bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) +bool rcu_watching_zero_in_eqs(int cpu, int *vp) { int snap; // If not quiescent, force back to earlier extended quiescent state. - snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX; - smp_rmb(); // Order ->dynticks and *vp reads. + snap = ct_rcu_watching_cpu(cpu) & ~CT_RCU_WATCHING; + smp_rmb(); // Order CT state and *vp reads. if (READ_ONCE(*vp)) return false; // Non-zero, so report failure; - smp_rmb(); // Order *vp read and ->dynticks re-read. + smp_rmb(); // Order *vp read and CT state re-read. // If still in the same extended quiescent state, we are good! - return snap == ct_dynticks_cpu(cpu); + return snap == ct_rcu_watching_cpu(cpu); } /* @@ -356,17 +367,17 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) * * The caller must have disabled interrupts and must not be idle. */ -notrace void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_eqs(void) { int seq; raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); - seq = ct_state_inc(2 * RCU_DYNTICKS_IDX); + seq = ct_state_inc(2 * CT_RCU_WATCHING); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX)); + WARN_ON_ONCE(!(seq & CT_RCU_WATCHING)); rcu_preempt_deferred_qs(current); } -EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); +EXPORT_SYMBOL_GPL(rcu_momentary_eqs); /** * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle @@ -388,13 +399,13 @@ static int rcu_is_cpu_rrupt_from_idle(void) lockdep_assert_irqs_disabled(); /* Check for counter underflows */ - RCU_LOCKDEP_WARN(ct_dynticks_nesting() < 0, - "RCU dynticks_nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= 0, - "RCU dynticks_nmi_nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(ct_nesting() < 0, + "RCU nesting counter underflow!"); + RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0, + "RCU nmi_nesting counter underflow/zero!"); /* Are we at first interrupt nesting level? */ - nesting = ct_dynticks_nmi_nesting(); + nesting = ct_nmi_nesting(); if (nesting > 1) return false; @@ -404,7 +415,7 @@ static int rcu_is_cpu_rrupt_from_idle(void) WARN_ON_ONCE(!nesting && !is_idle_task(current)); /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_dynticks_nesting() == 0; + return ct_nesting() == 0; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) @@ -596,12 +607,12 @@ void rcu_irq_exit_check_preempt(void) { lockdep_assert_irqs_disabled(); - RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0, - "RCU dynticks_nesting counter underflow/zero!"); - RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() != - DYNTICK_IRQ_NONIDLE, - "Bad RCU dynticks_nmi_nesting counter\n"); - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(ct_nesting() <= 0, + "RCU nesting counter underflow/zero!"); + RCU_LOCKDEP_WARN(ct_nmi_nesting() != + CT_NESTING_IRQ_NONIDLE, + "Bad RCU nmi_nesting counter\n"); + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "RCU in extended quiescent state!"); } #endif /* #ifdef CONFIG_PROVE_RCU */ @@ -641,7 +652,7 @@ void __rcu_irq_enter_check_tick(void) if (in_nmi()) return; - RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), + RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); if (!tick_nohz_full_cpu(rdp->cpu) || @@ -723,7 +734,7 @@ notrace bool rcu_is_watching(void) bool ret; preempt_disable_notrace(); - ret = !rcu_dynticks_curr_cpu_in_eqs(); + ret = rcu_is_watching_curr_cpu(); preempt_enable_notrace(); return ret; } @@ -765,11 +776,11 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) } /* - * Snapshot the specified CPU's dynticks counter so that we can later + * Snapshot the specified CPU's RCU_WATCHING counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp) +static int rcu_watching_snap_save(struct rcu_data *rdp) { /* * Full ordering between remote CPU's post idle accesses and updater's @@ -782,8 +793,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Ordering between remote CPU's pre idle accesses and post grace period * updater's accesses is enforced by the below acquire semantic. */ - rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu); - if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { + rdp->watching_snap = ct_rcu_watching_cpu_acquire(rdp->cpu); + if (rcu_watching_snap_in_eqs(rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rdp->mynode, rdp); return 1; @@ -794,14 +805,14 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) /* * Returns positive if the specified CPU has passed through a quiescent state * by virtue of being in or having passed through an dynticks idle state since - * the last call to dyntick_save_progress_counter() for this same CPU, or by + * the last call to rcu_watching_snap_save() for this same CPU, or by * virtue of having been offline. * * Returns negative if the specified CPU needs a force resched. * * Returns zero otherwise. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +static int rcu_watching_snap_recheck(struct rcu_data *rdp) { unsigned long jtsq; int ret = 0; @@ -815,7 +826,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->watching_snap)) { trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); rcu_gpnum_ovf(rnp, rdp); return 1; @@ -1649,7 +1660,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) * the done tail list manipulations are protected here. */ done = smp_load_acquire(&rcu_state.srs_done_tail); - if (!done) + if (WARN_ON_ONCE(!done)) return; WARN_ON_ONCE(!rcu_sr_is_wait_head(done)); @@ -1984,10 +1995,10 @@ static void rcu_gp_fqs(bool first_time) if (first_time) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(dyntick_save_progress_counter); + force_qs_rnp(rcu_watching_snap_save); } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rcu_implicit_dynticks_qs); + force_qs_rnp(rcu_watching_snap_recheck); } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { @@ -2383,7 +2394,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; - bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2420,23 +2430,11 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * to return true. So complain, but don't awaken. */ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp)); - } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - /* - * ...but NOCB kthreads may miss or delay callbacks acceleration - * if in the middle of a (de-)offloading process. - */ - needacc = true; } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ - - if (needacc) { - rcu_nocb_lock_irqsave(rdp, flags); - rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); - } } } @@ -2791,24 +2789,6 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - /* - * On RT rcu_core() can be preempted when IRQs aren't disabled. - * Therefore this function can race with concurrent NOCB (de-)offloading - * on this CPU and the below condition must be considered volatile. - * However if we race with: - * - * _ Offloading: In the worst case we accelerate or process callbacks - * concurrently with NOCB kthreads. We are guaranteed to - * call rcu_nocb_lock() if that happens. - * - * _ Deoffloading: In the worst case we miss callbacks acceleration or - * processing. This is fine because the early stage - * of deoffloading invokes rcu_core() after setting - * SEGCBLIST_RCU_CORE. So we guarantee that we'll process - * what could have been dismissed without the need to wait - * for the next rcu_pending() check in the next jiffy. - */ - const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2828,17 +2808,17 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { - rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { + local_irq_save(flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - rcu_nocb_unlock_irqrestore(rdp, flags); + local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); /* Re-invoke RCU core processing if there are callbacks remaining. */ @@ -2855,7 +2835,7 @@ static __latent_entropy void rcu_core(void) queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } -static void rcu_core_si(struct softirq_action *h) +static void rcu_core_si(void) { rcu_core(); } @@ -3227,7 +3207,7 @@ struct kvfree_rcu_bulk_data { struct list_head list; struct rcu_gp_oldstate gp_snap; unsigned long nr_records; - void *records[]; + void *records[] __counted_by(nr_records); }; /* @@ -3539,10 +3519,10 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) if (delayed_work_pending(&krcp->monitor_work)) { delay_left = krcp->monitor_work.timer.expires - jiffies; if (delay < delay_left) - mod_delayed_work(system_wq, &krcp->monitor_work, delay); + mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); return; } - queue_delayed_work(system_wq, &krcp->monitor_work, delay); + queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); } static void @@ -3584,18 +3564,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) } /* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * Return: %true if a work is queued, %false otherwise. */ -static void kfree_rcu_monitor(struct work_struct *work) +static bool +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) { - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); unsigned long flags; + bool queued = false; int i, j; - // Drain ready for reclaim. - kvfree_rcu_drain_ready(krcp); - raw_spin_lock_irqsave(&krcp->lock, flags); // Attempt to start a new batch. @@ -3634,11 +3611,27 @@ static void kfree_rcu_monitor(struct work_struct *work) // be that the work is in the pending state when // channels have been detached following by each // other. - queue_rcu_work(system_wq, &krwp->rcu_work); + queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); } } raw_spin_unlock_irqrestore(&krcp->lock, flags); + return queued; +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + + // Queue a batch for a rest. + kvfree_rcu_queue_batch(krcp); // If there is nothing to detach, it means that our job is // successfully done here. In case of having at least one @@ -3704,7 +3697,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !atomic_xchg(&krcp->work_in_progress, 1)) { if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_wq, + queue_delayed_work(system_unbound_wq, &krcp->page_cache_work, msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); } else { @@ -3767,7 +3760,8 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, } // Finally insert and update the GP for this page. - bnode->records[bnode->nr_records++] = ptr; + bnode->nr_records++; + bnode->records[bnode->nr_records - 1] = ptr; get_state_synchronize_rcu_full(&bnode->gp_snap); atomic_inc(&(*krcp)->bulk_count[idx]); @@ -3859,6 +3853,86 @@ unlock_return: } EXPORT_SYMBOL_GPL(kvfree_call_rcu); +/** + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. + * + * Note that a single argument of kvfree_rcu() call has a slow path that + * triggers synchronize_rcu() following by freeing a pointer. It is done + * before the return from the function. Therefore for any single-argument + * call that will result in a kfree() to a cache that is to be destroyed + * during module exit, it is developer's responsibility to ensure that all + * such calls have returned before the call to kmem_cache_destroy(). + */ +void kvfree_rcu_barrier(void) +{ + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. + * + * Please note. If there are outstanding batches for a particular + * CPU, those have to be finished first following by queuing a new. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * Check if this CPU has any objects which have been queued for a + * new GP completion. If not(means nothing to detach), we are done + * with it. If any batch is pending/running for this "krcp", below + * per-cpu flush_rcu_work() waits its completion(see last step). + */ + if (!need_offload_krc(krcp)) + continue; + + while (1) { + /* + * If we are not able to queue a new RCU work it means: + * - batches for this CPU are still in flight which should + * be flushed first and then repeat; + * - no objects to detach, because of concurrency. + */ + queued = kvfree_rcu_queue_batch(krcp); + + /* + * Bail out, if there is no need to offload this "krcp" + * anymore. As noted earlier it can run concurrently. + */ + if (queued || !need_offload_krc(krcp)) + break; + + /* There are ongoing batches. */ + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } + } + + /* + * Now we guarantee that all objects are flushed. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * A monitor work can drain ready to reclaim objects + * directly. Wait its completion if running or pending. + */ + cancel_delayed_work_sync(&krcp->monitor_work); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } +} +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -4403,6 +4477,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp) { unsigned long __maybe_unused s = rcu_state.barrier_sequence; + rhp->next = rhp; // Mark the callback as having been invoked. if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { rcu_barrier_trace(TPS("LastCB"), -1, s); complete(&rcu_state.barrier_completion); @@ -4804,8 +4879,8 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); INIT_WORK(&rdp->strict_work, strict_work_handler); - WARN_ON_ONCE(ct->dynticks_nesting != 1); - WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu))); + WARN_ON_ONCE(ct->nesting != 1); + WARN_ON_ONCE(rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu))); rdp->barrier_seq_snap = rcu_state.barrier_sequence; rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; rdp->rcu_ofl_gp_state = RCU_GP_CLEANED; @@ -4898,7 +4973,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; - ct->dynticks_nesting = 1; /* CPU not up, no tearing. */ + ct->nesting = 1; /* CPU not up, no tearing. */ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* @@ -5058,7 +5133,7 @@ void rcutree_report_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; arch_spin_lock(&rcu_state.ofl_lock); - rcu_dynticks_eqs_online(); + rcu_watching_online(); raw_spin_lock(&rcu_state.barrier_lock); raw_spin_lock_rcu_node(rnp); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); @@ -5424,6 +5499,8 @@ static void __init rcu_init_one(void) while (i > rnp->grphi) rnp++; per_cpu_ptr(&rcu_data, i)->mynode = rnp; + per_cpu_ptr(&rcu_data, i)->barrier_head.next = + &per_cpu_ptr(&rcu_data, i)->barrier_head; rcu_boot_init_percpu_data(i); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcf2b4aa3441..a9a811d9d7a3 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -206,7 +206,7 @@ struct rcu_data { long blimit; /* Upper limit on a processed batch */ /* 3) dynticks interface. */ - int dynticks_snap; /* Per-GP tracking for dynticks. */ + int watching_snap; /* Per-GP tracking for dynticks. */ bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ @@ -215,7 +215,7 @@ struct rcu_data { /* 4) rcu_barrier(), OOM callbacks, and expediting. */ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */ struct rcu_head barrier_head; - int exp_dynticks_snap; /* Double-check need for IPI. */ + int exp_watching_snap; /* Double-check need for IPI. */ /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -411,7 +411,6 @@ struct rcu_state { arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp; /* Synchronize offline with */ /* GP pre-initialization. */ - int nocb_is_setup; /* nocb is setup from boot */ /* synchronize_rcu() part. */ struct llist_head srs_next; /* request a GP users. */ @@ -420,6 +419,11 @@ struct rcu_state { struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; struct work_struct srs_cleanup_work; atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */ + +#ifdef CONFIG_RCU_NOCB_CPU + struct mutex nocb_mutex; /* Guards (de-)offloading */ + int nocb_is_setup; /* nocb is setup from boot */ +#endif }; /* Values for rcu_state structure's gp_flags field. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4acd29d16fdb..fb664d3a01c9 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -7,6 +7,7 @@ * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/console.h> #include <linux/lockdep.h> static void rcu_exp_handler(void *unused); @@ -376,11 +377,11 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) * post grace period updater's accesses is enforced by the * below acquire semantic. */ - snap = ct_dynticks_cpu_acquire(cpu); - if (rcu_dynticks_in_eqs(snap)) + snap = ct_rcu_watching_cpu_acquire(cpu); + if (rcu_watching_snap_in_eqs(snap)) mask_ofl_test |= mask; else - rdp->exp_dynticks_snap = snap; + rdp->exp_watching_snap = snap; } } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; @@ -400,7 +401,7 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) unsigned long mask = rdp->grpmask; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp, rdp->exp_dynticks_snap)) { + if (rcu_watching_snap_stopped_since(rdp, rdp->exp_watching_snap)) { mask_ofl_test |= mask; continue; } @@ -543,6 +544,67 @@ static bool synchronize_rcu_expedited_wait_once(long tlimit) } /* + * Print out an expedited RCU CPU stall warning message. + */ +static void synchronize_rcu_expedited_stall(unsigned long jiffies_start, unsigned long j) +{ + int cpu; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(); + + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected expedited stalls, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); + ndetected = 0; + rcu_for_each_leaf_node(rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(&rcu_data, cpu); + pr_cont(" %d-%c%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); + if (ndetected) { + pr_err("blocking rcu_node structures (internal RCU debug):"); + rcu_for_each_node_breadth_first(rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_exp_done_unlocked(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(READ_ONCE(rnp->expmask) & mask)) + continue; + dump_cpu_task(cpu); + } + rcu_exp_print_detail_task_stall_rnp(rnp); + } +} + +/* * Wait for the expedited grace period to elapse, issuing any needed * RCU CPU stall warnings along the way. */ @@ -553,10 +615,8 @@ static void synchronize_rcu_expedited_wait(void) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; - int ndetected; struct rcu_data *rdp; struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(); unsigned long flags; trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait")); @@ -590,59 +650,17 @@ static void synchronize_rcu_expedited_wait(void) return; if (rcu_stall_is_suppressed()) continue; + + nbcon_cpu_emergency_enter(); + j = jiffies; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rcu_state.name); - ndetected = 0; - rcu_for_each_leaf_node(rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) { - struct rcu_data *rdp; - - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], - "D."[!!data_race(rdp->cpu_no_qs.b.exp)]); - } - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - j - jiffies_start, rcu_state.expedited_sequence, - data_race(rnp_root->expmask), - ".T"[!!data_race(rnp_root->exp_tasks)]); - if (ndetected) { - pr_err("blocking rcu_node structures (internal RCU debug):"); - rcu_for_each_node_breadth_first(rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_exp_done_unlocked(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - data_race(rnp->expmask), - ".T"[!!data_race(rnp->exp_tasks)]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rnp) { - for_each_leaf_node_possible_cpu(rnp, cpu) { - mask = leaf_node_cpu_bit(rnp, cpu); - if (!(READ_ONCE(rnp->expmask) & mask)) - continue; - preempt_disable(); // For smp_processor_id() in dump_cpu_task(). - dump_cpu_task(cpu); - preempt_enable(); - } - rcu_exp_print_detail_task_stall_rnp(rnp); - } + synchronize_rcu_expedited_stall(jiffies_start, j); jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); } } diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 3ce30841119a..97b99cd06923 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -16,10 +16,6 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return lockdep_is_held(&rdp->nocb_lock); -} static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) { @@ -220,7 +216,7 @@ static bool __wake_nocb_gp(struct rcu_data *rdp_gp, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); - wake_up_process(rdp_gp->nocb_gp_kthread); + swake_up_one_online(&rdp_gp->nocb_gp_wq); } return needwake; @@ -413,14 +409,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return false; } - // In the process of (de-)offloading: no bypassing, but - // locking. - if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { - rcu_nocb_lock(rdp); - *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); - return false; /* Not offloaded, no bypassing. */ - } - // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { rcu_nocb_lock(rdp); @@ -505,7 +493,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); } rcu_nocb_bypass_unlock(rdp); - smp_mb(); /* Order enqueue before wake. */ + // A wake up of the grace period kthread or timer adjustment // needs to be done only if: // 1. Bypass list was fully empty before (this is the first @@ -616,37 +604,33 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, } } -static int nocb_gp_toggle_rdp(struct rcu_data *rdp) +static void nocb_gp_toggle_rdp(struct rcu_data *rdp_gp, struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int ret; - rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + /* + * Locking orders future de-offloaded callbacks enqueue against previous + * handling of this rdp. Ie: Make sure rcuog is done with this rdp before + * deoffloaded callbacks can be enqueued. + */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { /* * Offloading. Set our flag and notify the offload worker. * We will handle this rdp until it ever gets de-offloaded. */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 1; - } else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) && - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + rcu_segcblist_set_flags(cblist, SEGCBLIST_OFFLOADED); + } else { /* * De-offloading. Clear our flag and notify the de-offload worker. * We will ignore this rdp until it ever gets re-offloaded. */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - ret = 0; - } else { - WARN_ON_ONCE(1); - ret = -1; + list_del(&rdp->nocb_entry_rdp); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_OFFLOADED); } - - rcu_nocb_unlock_irqrestore(rdp, flags); - - return ret; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } static void nocb_gp_sleep(struct rcu_data *my_rdp, int cpu) @@ -853,14 +837,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (rdp_toggling) { - int ret; - - ret = nocb_gp_toggle_rdp(rdp_toggling); - if (ret == 1) - list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp); - else if (ret == 0) - list_del(&rdp_toggling->nocb_entry_rdp); - + nocb_gp_toggle_rdp(my_rdp, rdp_toggling); swake_up_one(&rdp_toggling->nocb_state_wq); } @@ -917,7 +894,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); /* * Disable BH to provide the expected environment. Also, when @@ -1030,16 +1007,11 @@ void rcu_nocb_flush_deferred_wakeup(void) } EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); -static int rdp_offload_toggle(struct rcu_data *rdp, - bool offload, unsigned long flags) - __releases(rdp->nocb_lock) +static int rcu_nocb_queue_toggle_rdp(struct rcu_data *rdp) { - struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; bool wake_gp = false; - - rcu_segcblist_offload(cblist, offload); - rcu_nocb_unlock_irqrestore(rdp, flags); + unsigned long flags; raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); // Queue this rdp for add/del to/from the list to iterate on rcuog @@ -1053,88 +1025,73 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return wake_gp; } -static long rcu_nocb_rdp_deoffload(void *arg) +static bool rcu_nocb_rdp_deoffload_wait_cond(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; - int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool ret; /* - * rcu_nocb_rdp_deoffload() may be called directly if - * rcuog/o[p] spawn failed, because at this time the rdp->cpu - * is not online yet. + * Locking makes sure rcuog is done handling this rdp before deoffloaded + * enqueue can happen. Also it keeps the SEGCBLIST_OFFLOADED flag stable + * while the ->nocb_lock is held. */ - WARN_ON_ONCE((rdp->cpu != raw_smp_processor_id()) && cpu_online(rdp->cpu)); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = !rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static int rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + unsigned long flags; + int wake_gp; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + + /* CPU must be offline, unless it's early boot */ + WARN_ON_ONCE(cpu_online(rdp->cpu) && rdp->cpu != raw_smp_processor_id()); pr_info("De-offloading %d\n", rdp->cpu); - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Flush once and for all now. This suffices because we are - * running on the target CPU holding ->nocb_lock (thus having - * interrupts disabled), and because rdp_offload_toggle() - * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. - * Thus future calls to rcu_segcblist_completely_offloaded() will - * return false, which means that future calls to rcu_nocb_try_bypass() - * will refuse to put anything into the bypass. - */ - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + /* Flush all callbacks from segcblist and bypass */ + rcu_barrier(); + /* - * Start with invoking rcu_core() early. This way if the current thread - * happens to preempt an ongoing call to rcu_core() in the middle, - * leaving some work dismissed because rcu_core() still thinks the rdp is - * completely offloaded, we are guaranteed a nearby future instance of - * rcu_core() to catch up. + * Make sure the rcuoc kthread isn't in the middle of a nocb locked + * sequence while offloading is deactivated, along with nocb locking. */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); - invoke_rcu_core(); - wake_gp = rdp_offload_toggle(rdp, false, flags); + if (rdp->nocb_cb_kthread) + kthread_park(rdp->nocb_cb_kthread); + + rcu_nocb_lock_irqsave(rdp, flags); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_nocb_unlock_irqrestore(rdp, flags); + + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); mutex_lock(&rdp_gp->nocb_gp_kthread_mutex); + if (rdp_gp->nocb_gp_kthread) { if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, - SEGCBLIST_KTHREAD_GP)); - if (rdp->nocb_cb_kthread) - kthread_park(rdp->nocb_cb_kthread); + rcu_nocb_rdp_deoffload_wait_cond(rdp)); } else { /* * No kthread to clear the flags for us or remove the rdp from the nocb list * to iterate. Do it here instead. Locking doesn't look stricly necessary * but we stick to paranoia in this rare path. */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); list_del(&rdp->nocb_entry_rdp); } - mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - - /* - * Lock one last time to acquire latest callback updates from kthreads - * so we can later handle callbacks locally without locking. - */ - rcu_nocb_lock_irqsave(rdp, flags); - /* - * Theoretically we could clear SEGCBLIST_LOCKING after the nocb - * lock is released but how about being paranoid for once? - */ - rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); - /* - * Without SEGCBLIST_LOCKING, we can't use - * rcu_nocb_unlock_irqrestore() anymore. - */ - raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); - - /* Sanity check */ - WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); return 0; } @@ -1145,33 +1102,42 @@ int rcu_nocb_cpu_deoffload(int cpu) int ret = 0; cpus_read_lock(); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_deoffload(rdp); if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-deoffload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); -static long rcu_nocb_rdp_offload(void *arg) +static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) { - struct rcu_data *rdp = arg; - struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; + bool ret; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + ret = rcu_segcblist_test_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + + return ret; +} + +static int rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ int wake_gp; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + WARN_ON_ONCE(cpu_online(rdp->cpu)); /* * For now we only support re-offload, ie: the rdp must have been * offloaded on boot first. @@ -1184,44 +1150,17 @@ static long rcu_nocb_rdp_offload(void *arg) pr_info("Offloading %d\n", rdp->cpu); - /* - * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING - * is set. - */ - raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(&rdp->cblist)); - /* - * We didn't take the nocb lock while working on the - * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). - * Every modifications that have been done previously on - * rdp->cblist must be visible remotely by the nocb kthreads - * upon wake up after reading the cblist flags. - * - * The layout against nocb_lock enforces that ordering: - * - * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() - * ------------------------- ---------------------------- - * WRITE callbacks rcu_nocb_lock() - * rcu_nocb_lock() READ flags - * WRITE flags READ callbacks - * rcu_nocb_unlock() rcu_nocb_unlock() - */ - wake_gp = rdp_offload_toggle(rdp, true, flags); + wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); - kthread_unpark(rdp->nocb_cb_kthread); - swait_event_exclusive(rdp->nocb_state_wq, - rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_nocb_rdp_offload_wait_cond(rdp)); - /* - * All kthreads are ready to work, we can finally relieve rcu_core() and - * enable nocb bypass. - */ - rcu_nocb_lock_irqsave(rdp, flags); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); - rcu_nocb_unlock_irqrestore(rdp, flags); + kthread_unpark(rdp->nocb_cb_kthread); return 0; } @@ -1232,18 +1171,18 @@ int rcu_nocb_cpu_offload(int cpu) int ret = 0; cpus_read_lock(); - mutex_lock(&rcu_state.barrier_mutex); + mutex_lock(&rcu_state.nocb_mutex); if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) { - ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + if (!cpu_online(cpu)) { + ret = rcu_nocb_rdp_offload(rdp); if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } else { - pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu); + pr_info("NOCB: Cannot CB-offload online CPU %d\n", rdp->cpu); ret = -EINVAL; } } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); cpus_read_unlock(); return ret; @@ -1261,7 +1200,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) return 0; /* Protect rcu_nocb_mask against concurrent (de-)offloading. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) + if (!mutex_trylock(&rcu_state.nocb_mutex)) return 0; /* Snapshot count of all CPUs */ @@ -1271,7 +1210,7 @@ lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) count += READ_ONCE(rdp->lazy_len); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_EMPTY; } @@ -1289,9 +1228,9 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) * Protect against concurrent (de-)offloading. Otherwise nocb locking * may be ignored or imbalanced. */ - if (!mutex_trylock(&rcu_state.barrier_mutex)) { + if (!mutex_trylock(&rcu_state.nocb_mutex)) { /* - * But really don't insist if barrier_mutex is contended since we + * But really don't insist if nocb_mutex is contended since we * can't guarantee that it will never engage in a dependency * chain involving memory allocation. The lock is seldom contended * anyway. @@ -1330,7 +1269,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) break; } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); return count ? count : SHRINK_STOP; } @@ -1396,9 +1335,7 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); - rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_OFFLOADED); } rcu_organize_nocb_kthreads(); } @@ -1446,7 +1383,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) "rcuog/%d", rdp_gp->cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) { mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex); - goto end; + goto err; } WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); if (kthread_prio) @@ -1458,7 +1395,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) t = kthread_create(rcu_nocb_cb_kthread, rdp, "rcuo%c/%d", rcu_state.abbr, cpu); if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) - goto end; + goto err; if (rcu_rdp_is_offloaded(rdp)) wake_up_process(t); @@ -1471,13 +1408,21 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_cb_kthread, t); WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); return; -end: - mutex_lock(&rcu_state.barrier_mutex); + +err: + /* + * No need to protect against concurrent rcu_barrier() + * because the number of callbacks should be 0 for a non-boot CPU, + * therefore rcu_barrier() shouldn't even try to grab the nocb_lock. + * But hold nocb_mutex to avoid nocb_lock imbalance from shrinker. + */ + WARN_ON_ONCE(system_state > SYSTEM_BOOTING && rcu_segcblist_n_cbs(&rdp->cblist)); + mutex_lock(&rcu_state.nocb_mutex); if (rcu_rdp_is_offloaded(rdp)) { rcu_nocb_rdp_deoffload(rdp); cpumask_clear_cpu(cpu, rcu_nocb_mask); } - mutex_unlock(&rcu_state.barrier_mutex); + mutex_unlock(&rcu_state.nocb_mutex); } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ @@ -1653,16 +1598,6 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) -{ - return 0; -} - -static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) -{ - return false; -} - /* No ->nocb_lock to acquire. */ static void rcu_nocb_lock(struct rcu_data *rdp) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c569da65b421..1c7cbd145d5e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -24,10 +24,11 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) * timers have their own means of synchronization against the * offloaded state updaters. */ - RCU_LOCKDEP_WARN( + RCU_NOCB_LOCKDEP_WARN( !(lockdep_is_held(&rcu_state.barrier_mutex) || (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || - rcu_lockdep_is_held_nocb(rdp) || + lockdep_is_held(&rdp->nocb_lock) || + lockdep_is_held(&rcu_state.nocb_mutex) || (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) && rdp == this_cpu_ptr(&rcu_data)) || rcu_current_is_nocb_kthread(rdp)), @@ -869,7 +870,7 @@ static void rcu_qs(void) /* * Register an urgently needed quiescent state. If there is an - * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * emergency, invoke rcu_momentary_eqs() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs, which will in * some cases serve for expedited as well as normal grace periods. * Either way, register a lightweight quiescent state. @@ -889,7 +890,7 @@ void rcu_all_qs(void) this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { local_irq_save(flags); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); local_irq_restore(flags); } rcu_qs(); @@ -909,7 +910,7 @@ void rcu_note_context_switch(bool preempt) goto out; this_cpu_write(rcu_data.rcu_urgent_qs, false); if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); out: rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 4b0e9d7c4c68..4432db6d0b99 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -7,8 +7,10 @@ * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ +#include <linux/console.h> #include <linux/kvm_para.h> #include <linux/rcu_notifier.h> +#include <linux/smp.h> ////////////////////////////////////////////////////////////////////////////// // @@ -370,6 +372,7 @@ static void rcu_dump_cpu_stacks(void) struct rcu_node *rnp; rcu_for_each_leaf_node(rnp) { + printk_deferred_enter(); raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -379,6 +382,7 @@ static void rcu_dump_cpu_stacks(void) dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + printk_deferred_exit(); } } @@ -501,7 +505,7 @@ static void print_cpu_stall_info(int cpu) } delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && - rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)); + rcu_watching_snap_in_eqs(ct_rcu_watching_cpu(cpu)); rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j); if (rcuc_starved) // Print signed value, as negative values indicate a probable bug. @@ -515,8 +519,8 @@ static void print_cpu_stall_info(int cpu) rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : "!."[!delta], ticks_value, ticks_title, - ct_dynticks_cpu(cpu) & 0xffff, - ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu), + ct_rcu_watching_cpu(cpu) & 0xffff, + ct_nesting_cpu(cpu), ct_nmi_nesting_cpu(cpu), rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, rcuc_starved ? buf : "", @@ -605,6 +609,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on our buddy... * See Documentation/RCU/stallwarn.rst for info on how to debug @@ -657,6 +663,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); rcu_force_quiescent_state(); /* Kick them all. */ @@ -677,6 +685,8 @@ static void print_cpu_stall(unsigned long gps) if (rcu_stall_is_suppressed()) return; + nbcon_cpu_emergency_enter(); + /* * OK, time to rat on ourselves... * See Documentation/RCU/stallwarn.rst for info on how to debug @@ -706,6 +716,8 @@ static void print_cpu_stall(unsigned long gps) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + nbcon_cpu_emergency_exit(); + panic_on_rcu_stall(); /* @@ -719,6 +731,9 @@ static void print_cpu_stall(unsigned long gps) set_preempt_need_resched(); } +static bool csd_lock_suppress_rcu_stall; +module_param(csd_lock_suppress_rcu_stall, bool, 0644); + static void check_cpu_stall(struct rcu_data *rdp) { bool self_detected; @@ -791,7 +806,9 @@ static void check_cpu_stall(struct rcu_data *rdp) return; rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); - if (self_detected) { + if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { + pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); + } else if (self_detected) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); } else { diff --git a/kernel/resource.c b/kernel/resource.c index 14777afb0a99..b730bd28b422 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -450,8 +450,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, /* re-alloc */ struct resource *rams_new; - rams_new = kvrealloc(rams, rams_size * sizeof(struct resource), - (rams_size + 16) * sizeof(struct resource), + rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource), GFP_KERNEL); if (!rams_new) goto out; @@ -540,20 +539,62 @@ static int __region_intersects(struct resource *parent, resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - struct resource res; + resource_size_t ostart, oend; int type = 0; int other = 0; - struct resource *p; + struct resource *p, *dp; + bool is_type, covered; + struct resource res; res.start = start; res.end = start + size - 1; for (p = parent->child; p ; p = p->sibling) { - bool is_type = (((p->flags & flags) == flags) && - ((desc == IORES_DESC_NONE) || - (desc == p->desc))); - - if (resource_overlaps(p, &res)) - is_type ? type++ : other++; + if (!resource_overlaps(p, &res)) + continue; + is_type = (p->flags & flags) == flags && + (desc == IORES_DESC_NONE || desc == p->desc); + if (is_type) { + type++; + continue; + } + /* + * Continue to search in descendant resources as if the + * matched descendant resources cover some ranges of 'p'. + * + * |------------- "CXL Window 0" ------------| + * |-- "System RAM" --| + * + * will behave similar as the following fake resource + * tree when searching "System RAM". + * + * |-- "System RAM" --||-- "CXL Window 0a" --| + */ + covered = false; + ostart = max(res.start, p->start); + oend = min(res.end, p->end); + for_each_resource(p, dp, false) { + if (!resource_overlaps(dp, &res)) + continue; + is_type = (dp->flags & flags) == flags && + (desc == IORES_DESC_NONE || desc == dp->desc); + if (is_type) { + type++; + /* + * Range from 'ostart' to 'dp->start' + * isn't covered by matched resource. + */ + if (dp->start > ostart) + break; + if (dp->end >= oend) { + covered = true; + break; + } + /* Remove covered range */ + ostart = max(ostart, dp->end + 1); + } + } + if (!covered) + other++; } if (type == 0) @@ -1818,7 +1859,11 @@ EXPORT_SYMBOL(resource_list_free); #ifdef CONFIG_GET_FREE_REGION #define GFR_DESCENDING (1UL << 0) #define GFR_REQUEST_REGION (1UL << 1) -#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) +#ifdef PA_SECTION_SHIFT +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) +#else +#define GFR_DEFAULT_ALIGN PAGE_SIZE +#endif static resource_size_t gfr_start(struct resource *base, resource_size_t size, resource_size_t align, unsigned long flags) @@ -1826,12 +1871,11 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size, if (flags & GFR_DESCENDING) { resource_size_t end; - end = min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + end = min_t(resource_size_t, base->end, PHYSMEM_END); return end - size + 1; } - return ALIGN(base->start, align); + return ALIGN(max(base->start, align), align); } static bool gfr_continue(struct resource *base, resource_size_t addr, @@ -1844,8 +1888,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr, * @size did not wrap 0. */ return addr > addr - size && - addr <= min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + addr <= min_t(resource_size_t, base->end, PHYSMEM_END); } static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, @@ -2006,7 +2049,7 @@ struct resource *alloc_free_mem_region(struct resource *base, return get_free_mem_region(NULL, base, size, align, name, IORES_DESC_NONE, flags); } -EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL); +EXPORT_SYMBOL_GPL(alloc_free_mem_region); #endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 0e509985a44a..42d2d8d20f5d 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -7,6 +7,8 @@ #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/string.h> +#include <linux/sizes.h> +#include <linux/mm.h> #define R0_START 0x0000 #define R0_END 0xffff @@ -137,9 +139,150 @@ static void resource_test_intersection(struct kunit *test) } while (++i < ARRAY_SIZE(results_for_intersection)); } +/* + * The test resource tree for region_intersects() test: + * + * BASE-BASE+1M-1 : Test System RAM 0 + * # hole 0 (BASE+1M-BASE+2M) + * BASE+2M-BASE+3M-1 : Test CXL Window 0 + * BASE+3M-BASE+4M-1 : Test System RAM 1 + * BASE+4M-BASE+7M-1 : Test CXL Window 1 + * BASE+4M-BASE+5M-1 : Test System RAM 2 + * BASE+4M+128K-BASE+4M+256K-1: Test Code + * BASE+5M-BASE+6M-1 : Test System RAM 3 + */ +#define RES_TEST_RAM0_OFFSET 0 +#define RES_TEST_RAM0_SIZE SZ_1M +#define RES_TEST_HOLE0_OFFSET (RES_TEST_RAM0_OFFSET + RES_TEST_RAM0_SIZE) +#define RES_TEST_HOLE0_SIZE SZ_1M +#define RES_TEST_WIN0_OFFSET (RES_TEST_HOLE0_OFFSET + RES_TEST_HOLE0_SIZE) +#define RES_TEST_WIN0_SIZE SZ_1M +#define RES_TEST_RAM1_OFFSET (RES_TEST_WIN0_OFFSET + RES_TEST_WIN0_SIZE) +#define RES_TEST_RAM1_SIZE SZ_1M +#define RES_TEST_WIN1_OFFSET (RES_TEST_RAM1_OFFSET + RES_TEST_RAM1_SIZE) +#define RES_TEST_WIN1_SIZE (SZ_1M * 3) +#define RES_TEST_RAM2_OFFSET RES_TEST_WIN1_OFFSET +#define RES_TEST_RAM2_SIZE SZ_1M +#define RES_TEST_CODE_OFFSET (RES_TEST_RAM2_OFFSET + SZ_128K) +#define RES_TEST_CODE_SIZE SZ_128K +#define RES_TEST_RAM3_OFFSET (RES_TEST_RAM2_OFFSET + RES_TEST_RAM2_SIZE) +#define RES_TEST_RAM3_SIZE SZ_1M +#define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) + +static void remove_free_resource(void *ctx) +{ + struct resource *res = (struct resource *)ctx; + + remove_resource(res); + kfree(res); +} + +static void resource_test_request_region(struct kunit *test, struct resource *parent, + resource_size_t start, resource_size_t size, + const char *name, unsigned long flags) +{ + struct resource *res; + + res = __request_region(parent, start, size, name, flags); + KUNIT_ASSERT_NOT_NULL(test, res); + kunit_add_action_or_reset(test, remove_free_resource, res); +} + +static void resource_test_insert_resource(struct kunit *test, struct resource *parent, + resource_size_t start, resource_size_t size, + const char *name, unsigned long flags) +{ + struct resource *res; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, res); + + res->name = name; + res->start = start; + res->end = start + size - 1; + res->flags = flags; + if (insert_resource(parent, res)) { + kfree(res); + KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); + } + + kunit_add_action_or_reset(test, remove_free_resource, res); +} + +static void resource_test_region_intersects(struct kunit *test) +{ + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *parent; + resource_size_t start; + + /* Find an iomem_resource hole to hold test resources */ + parent = alloc_free_mem_region(&iomem_resource, RES_TEST_TOTAL_SIZE, SZ_1M, + "test resources"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + start = parent->start; + kunit_add_action_or_reset(test, remove_free_resource, parent); + + resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, + RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_WIN0_OFFSET, + RES_TEST_WIN0_SIZE, "Test CXL Window 0", + IORESOURCE_MEM); + resource_test_request_region(test, parent, start + RES_TEST_RAM1_OFFSET, + RES_TEST_RAM1_SIZE, "Test System RAM 1", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_WIN1_OFFSET, + RES_TEST_WIN1_SIZE, "Test CXL Window 1", + IORESOURCE_MEM); + resource_test_request_region(test, parent, start + RES_TEST_RAM2_OFFSET, + RES_TEST_RAM2_SIZE, "Test System RAM 2", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_CODE_OFFSET, + RES_TEST_CODE_SIZE, "Test Code", flags); + resource_test_request_region(test, parent, start + RES_TEST_RAM3_OFFSET, + RES_TEST_RAM3_SIZE, "Test System RAM 3", flags); + kunit_release_action(test, remove_free_resource, parent); + + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM0_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM0_OFFSET + + RES_TEST_RAM0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_DISJOINT, + region_intersects(start + RES_TEST_HOLE0_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_DISJOINT, + region_intersects(start + RES_TEST_HOLE0_OFFSET + + RES_TEST_HOLE0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_MIXED, + region_intersects(start + RES_TEST_WIN0_OFFSET + + RES_TEST_WIN0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM1_OFFSET + + RES_TEST_RAM1_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM2_OFFSET + + RES_TEST_RAM2_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_CODE_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM2_OFFSET, + RES_TEST_RAM2_SIZE + PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_MIXED, + region_intersects(start + RES_TEST_RAM3_OFFSET, + RES_TEST_RAM3_SIZE + PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); +} + static struct kunit_case resource_test_cases[] = { KUNIT_CASE(resource_test_union), KUNIT_CASE(resource_test_intersection), + KUNIT_CASE(resource_test_region_intersects), {} }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3951e4a55e5..c1a08035ea43 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -163,7 +163,10 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; - if (rt_prio(p->prio)) /* includes deadline */ + if (p->dl_server) + return -1; /* deadline */ + + if (rt_or_dl_prio(p->prio)) return p->prio; /* [-1, 99] */ if (p->sched_class == &idle_sched_class) @@ -192,8 +195,24 @@ static inline bool prio_less(const struct task_struct *a, if (-pb < -pa) return false; - if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ - return !dl_time_before(a->dl.deadline, b->dl.deadline); + if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ + const struct sched_dl_entity *a_dl, *b_dl; + + a_dl = &a->dl; + /* + * Since,'a' and 'b' can be CFS tasks served by DL server, + * __task_prio() can return -1 (for DL) even for those. In that + * case, get to the dl_server's DL entity. + */ + if (a->dl_server) + a_dl = a->dl_server; + + b_dl = &b->dl; + if (b->dl_server) + b_dl = b->dl_server; + + return !dl_time_before(a_dl->deadline, b_dl->deadline); + } if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); @@ -240,6 +259,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) void sched_core_enqueue(struct rq *rq, struct task_struct *p) { + if (p->se.sched_delayed) + return; + rq->core->core_task_seq++; if (!p->core_cookie) @@ -250,6 +272,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { + if (p->se.sched_delayed) + return; + rq->core->core_task_seq++; if (sched_core_enqueued(p)) { @@ -1269,7 +1294,7 @@ bool sched_can_stop_tick(struct rq *rq) * dequeued by migrating while the constrained task continues to run. * E.g. going from 2->1 without going through pick_next_task(). */ - if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (__need_bw_check(rq, rq->curr)) { if (cfs_task_bw_constrained(rq->curr)) return false; } @@ -1672,6 +1697,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) if (unlikely(!p->sched_class->uclamp_enabled)) return; + if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id); @@ -1696,6 +1724,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) if (unlikely(!p->sched_class->uclamp_enabled)) return; + if (p->se.sched_delayed) + return; + for_each_clamp_id(clamp_id) uclamp_rq_dec_id(rq, p, clamp_id); } @@ -1975,14 +2006,21 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); } - uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + /* + * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear + * ->sched_delayed. + */ + uclamp_rq_inc(rq, p); if (sched_core_enabled(rq)) sched_core_enqueue(rq, p); } -void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +/* + * Must only return false when DEQUEUE_SLEEP. + */ +inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) sched_core_dequeue(rq, p, flags); @@ -1995,8 +2033,12 @@ void dequeue_task(struct rq *rq, struct task_struct *p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } + /* + * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' + * and mark the task ->sched_delayed. + */ uclamp_rq_dec(rq, p); - p->sched_class->dequeue_task(rq, p, flags); + return p->sched_class->dequeue_task(rq, p, flags); } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -2014,12 +2056,25 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { - WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); + SCHED_WARN_ON(flags & DEQUEUE_SLEEP); + + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ASSERT_EXCLUSIVE_WRITER(p->on_rq); + /* + * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* + * dequeue_task() and cleared *after* enqueue_task(). + */ + dequeue_task(rq, p, flags); } +static void block_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) + __block_task(rq, p); +} + /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -2233,6 +2288,12 @@ void migrate_disable(void) struct task_struct *p = current; if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif p->migration_disabled++; return; } @@ -2251,14 +2312,20 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + if (p->migration_disabled > 1) { p->migration_disabled--; return; } - if (WARN_ON_ONCE(!p->migration_disabled)) - return; - /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). @@ -3607,8 +3674,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } #endif - - p->dl_server = NULL; } /* @@ -3644,12 +3709,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { + update_rq_clock(rq); + if (p->se.sched_delayed) + enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); if (!task_on_cpu(rq, p)) { /* * When on_rq && !on_cpu the task is preempted, see if * it should preempt the task that is current now. */ - update_rq_clock(rq); wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); @@ -4029,11 +4096,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * case the whole 'p->on_rq && ttwu_runnable()' case below * without taking any locks. * + * Specifically, given current runs ttwu() we must be before + * schedule()'s block_task(), as such this must not observe + * sched_delayed. + * * In particular: * - we rely on Program-Order guarantees for all the ordering, * - we're serialized against set_special_state() by virtue of * it disabling IRQs (this allows not taking ->pi_lock). */ + SCHED_WARN_ON(p->se.sched_delayed); if (!ttwu_state_match(p, state, &success)) goto out; @@ -4322,9 +4394,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vlag = 0; - p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); + /* A delayed task cannot be in clone(). */ + SCHED_WARN_ON(p->se.sched_delayed); + #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; #endif @@ -4572,6 +4646,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); + p->se.custom_slice = 0; + p->se.slice = sysctl_sched_base_slice; /* * We don't need the reset flag anymore after the fork. It has @@ -4686,7 +4762,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); - activate_task(rq, p, ENQUEUE_NOCLOCK); + activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -5762,15 +5838,15 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); - SCHED_WARN_ON(ct_state() == CONTEXT_USER); + SCHED_WARN_ON(ct_state() == CT_STATE_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq()->sched_count); } -static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) +static void prev_balance(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) { #ifdef CONFIG_SMP const struct sched_class *class; @@ -5787,8 +5863,6 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, break; } #endif - - put_prev_task(rq, prev); } /* @@ -5800,6 +5874,8 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) const struct sched_class *class; struct task_struct *p; + rq->dl_server = NULL; + /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a @@ -5815,35 +5891,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - put_prev_task(rq, prev); - p = pick_next_task_idle(rq); + p = pick_task_idle(rq); + put_prev_set_next_task(rq, prev, p); } - /* - * This is the fast path; it cannot be a DL server pick; - * therefore even if @p == @prev, ->dl_server must be NULL. - */ - if (p->dl_server) - p->dl_server = NULL; - return p; } restart: - put_prev_task_balance(rq, prev, rf); - - /* - * We've updated @prev and no longer need the server link, clear it. - * Must be done before ->pick_next_task() because that can (re)set - * ->dl_server. - */ - if (prev->dl_server) - prev->dl_server = NULL; + prev_balance(rq, prev, rf); for_each_class(class) { - p = class->pick_next_task(rq); - if (p) - return p; + if (class->pick_next_task) { + p = class->pick_next_task(rq, prev); + if (p) + return p; + } else { + p = class->pick_task(rq); + if (p) { + put_prev_set_next_task(rq, prev, p); + return p; + } + } } BUG(); /* The idle class should always have a runnable task. */ @@ -5873,6 +5942,8 @@ static inline struct task_struct *pick_task(struct rq *rq) const struct sched_class *class; struct task_struct *p; + rq->dl_server = NULL; + for_each_class(class) { p = class->pick_task(rq); if (p) @@ -5911,6 +5982,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * another cpu during offline. */ rq->core_pick = NULL; + rq->core_dl_server = NULL; return __pick_next_task(rq, prev, rf); } @@ -5929,16 +6001,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); next = rq->core_pick; - if (next != prev) { - put_prev_task(rq, prev); - set_next_task(rq, next); - } - + rq->dl_server = rq->core_dl_server; rq->core_pick = NULL; - goto out; + rq->core_dl_server = NULL; + goto out_set_next; } - put_prev_task_balance(rq, prev, rf); + prev_balance(rq, prev, rf); smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; @@ -5979,6 +6048,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) next = pick_task(rq); if (!next->core_cookie) { rq->core_pick = NULL; + rq->core_dl_server = NULL; /* * For robustness, update the min_vruntime_fi for * unconstrained picks as well. @@ -6006,7 +6076,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - p = rq_i->core_pick = pick_task(rq_i); + rq_i->core_pick = p = pick_task(rq_i); + rq_i->core_dl_server = rq_i->dl_server; + if (!max || prio_less(max, p, fi_before)) max = p; } @@ -6030,6 +6102,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } rq_i->core_pick = p; + rq_i->core_dl_server = NULL; if (p == rq_i->idle) { if (rq_i->nr_running) { @@ -6090,6 +6163,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i == cpu) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; } @@ -6098,6 +6172,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (rq_i->curr == rq_i->core_pick) { rq_i->core_pick = NULL; + rq_i->core_dl_server = NULL; continue; } @@ -6105,8 +6180,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } out_set_next: - set_next_task(rq, next); -out: + put_prev_set_next_task(rq, prev, next); if (rq->core->core_forceidle_count && next == rq->idle) queue_core_balance(rq); @@ -6342,19 +6416,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Constants for the sched_mode argument of __schedule(). * * The mode argument allows RT enabled kernels to differentiate a - * preemption from blocking on an 'sleeping' spin/rwlock. Note that - * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to - * optimize the AND operation out and just check for zero. + * preemption from blocking on an 'sleeping' spin/rwlock. */ -#define SM_NONE 0x0 -#define SM_PREEMPT 0x1 -#define SM_RTLOCK_WAIT 0x2 - -#ifndef CONFIG_PREEMPT_RT -# define SM_MASK_PREEMPT (~0U) -#else -# define SM_MASK_PREEMPT SM_PREEMPT -#endif +#define SM_IDLE (-1) +#define SM_NONE 0 +#define SM_PREEMPT 1 +#define SM_RTLOCK_WAIT 2 /* * __schedule() is the main scheduler function. @@ -6395,9 +6462,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(unsigned int sched_mode) +static void __sched notrace __schedule(int sched_mode) { struct task_struct *prev, *next; + /* + * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted + * as a preemption by schedule_debug() and RCU. + */ + bool preempt = sched_mode > SM_NONE; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6408,13 +6480,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) rq = cpu_rq(cpu); prev = rq->curr; - schedule_debug(prev, !!sched_mode); + schedule_debug(prev, preempt); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(!!sched_mode); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6443,22 +6515,32 @@ static void __sched notrace __schedule(unsigned int sched_mode) switch_count = &prev->nivcsw; + /* Task state changes only considers SM_PREEMPT as preemption */ + preempt = sched_mode == SM_PREEMPT; + /* * We must load prev->state once (task_struct::state is volatile), such * that we form a control dependency vs deactivate_task() below. */ prev_state = READ_ONCE(prev->__state); - if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { + if (sched_mode == SM_IDLE) { + if (!rq->nr_running) { + next = prev; + goto picked; + } + } else if (!preempt && prev_state) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { + int flags = DEQUEUE_NOCLOCK; + prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && !(prev_state & TASK_NOLOAD) && !(prev_state & TASK_FROZEN); - if (prev->sched_contributes_to_load) - rq->nr_uninterruptible++; + if (unlikely(is_special_task_state(prev_state))) + flags |= DEQUEUE_SPECIAL; /* * __schedule() ttwu() @@ -6471,17 +6553,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) * * After this, schedule() must not care about p->state any more. */ - deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - - if (prev->in_iowait) { - atomic_inc(&rq->nr_iowait); - delayacct_blkio_start(); - } + block_task(rq, prev, flags); } switch_count = &prev->nvcsw; } next = pick_next_task(rq, prev, &rf); +picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); #ifdef CONFIG_SCHED_DEBUG @@ -6523,7 +6601,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); - trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); + trace_sched_switch(preempt, prev, next, prev_state); /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); @@ -6599,7 +6677,7 @@ static void sched_update_worker(struct task_struct *tsk) } } -static __always_inline void __schedule_loop(unsigned int sched_mode) +static __always_inline void __schedule_loop(int sched_mode) { do { preempt_disable(); @@ -6644,7 +6722,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->__state); do { - __schedule(SM_NONE); + __schedule(SM_IDLE); } while (need_resched()); } @@ -6658,7 +6736,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != CONTEXT_USER, but that will trigger + * should warn if prev_state != CT_STATE_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); @@ -7405,7 +7483,7 @@ EXPORT_SYMBOL(io_schedule); void sched_show_task(struct task_struct *p) { - unsigned long free = 0; + unsigned long free; int ppid; if (!try_get_task_stack(p)) @@ -7415,9 +7493,7 @@ void sched_show_task(struct task_struct *p) if (task_is_running(p)) pr_cont(" running task "); -#ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); -#endif ppid = 0; rcu_read_lock(); if (pid_alive(p)) @@ -8228,8 +8304,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } - init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_SMP init_defrootdomain(); #endif @@ -8284,8 +8358,13 @@ void __init sched_init(void) init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED + /* + * This is required for init cpu because rt.c:__enable_runtime() + * starts working after scheduler_running, which is not the case + * yet. + */ + rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif #ifdef CONFIG_SMP @@ -8317,10 +8396,12 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + fair_server_init(rq); #ifdef CONFIG_SCHED_CORE rq->core = rq; rq->core_pick = NULL; + rq->core_dl_server = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; @@ -8333,6 +8414,7 @@ void __init sched_init(void) } set_load_weight(&init_task, false); + init_task.se.slice = sysctl_sched_base_slice, /* * The boot idle thread does lazy MMU switching as well: @@ -8548,7 +8630,7 @@ void normalize_rt_tasks(void) schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.block_start, 0); - if (!dl_task(p) && !rt_task(p)) { + if (!rt_or_dl_task(p)) { /* * Renice negative nice level userspace * tasks back to 0: @@ -9752,7 +9834,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { void dump_cpu_task(int cpu) { - if (cpu == smp_processor_id() && in_hardirq()) { + if (in_hardirq() && cpu == smp_processor_id()) { struct pt_regs *regs; regs = get_irq_regs(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index eece6244f9d2..43111a515a28 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -654,9 +654,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) * Fake (unused) bandwidth; workaround to "fix" * priority inheritance. */ - .sched_runtime = 1000000, - .sched_deadline = 10000000, - .sched_period = 10000000, + .sched_runtime = NSEC_PER_MSEC, + .sched_deadline = 10 * NSEC_PER_MSEC, + .sched_period = 10 * NSEC_PER_MSEC, }; struct cpufreq_policy *policy = sg_policy->policy; int ret; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f59e5c19d944..9ce93d0bf452 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -static void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) { - struct rq *rq; - - WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); - - if (task_on_rq_queued(p)) - return; + if (dl_se->dl_non_contending) { + sub_running_bw(dl_se, &rq->dl); + dl_se->dl_non_contending = 0; - rq = task_rq(p); - if (p->dl.dl_non_contending) { - sub_running_bw(&p->dl, &rq->dl); - p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the * timer cannot be canceled, inactive_task_timer() @@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } - __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __sub_rq_bw(dl_se->dl_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl); } +static void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); + + if (task_on_rq_queued(p)) + return; + + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); +} + static void __dl_clear_params(struct sched_dl_entity *dl_se); /* @@ -771,6 +776,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, /* for non-boosted task, pi_of(dl_se) == dl_se */ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; dl_se->runtime = pi_of(dl_se)->dl_runtime; + + /* + * If it is a deferred reservation, and the server + * is not handling an starvation case, defer it. + */ + if (dl_se->dl_defer & !dl_se->dl_defer_running) { + dl_se->dl_throttled = 1; + dl_se->dl_defer_armed = 1; + } } /* @@ -809,6 +823,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) replenish_dl_new_period(dl_se, rq); } +static int start_dl_timer(struct sched_dl_entity *dl_se); +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); + /* * Pure Earliest Deadline First (EDF) scheduling does not deal with the * possibility of a entity lasting more than what it declared, and thus @@ -837,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. + * + * Or, it could be the case of a deferred reservation that + * was not able to consume its runtime in background and + * reached this point with current u > U. + * + * In both cases, set a new period. */ - if (dl_se->dl_deadline == 0) - replenish_dl_new_period(dl_se, rq); + if (dl_se->dl_deadline == 0 || + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; + } if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -873,6 +899,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) dl_se->dl_yielded = 0; if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + + /* + * If this is the replenishment of a deferred reservation, + * clear the flag and return. + */ + if (dl_se->dl_defer_armed) { + dl_se->dl_defer_armed = 0; + return; + } + + /* + * A this point, if the deferred server is not armed, and the deadline + * is in the future, if it is not running already, throttle the server + * and arm the defer timer. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running && + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + + /* + * Set dl_se->dl_defer_armed and dl_throttled variables to + * inform the start_dl_timer() that this is a deferred + * activation. + */ + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + if (!start_dl_timer(dl_se)) { + /* + * If for whatever reason (delays), a previous timer was + * queued but not serviced, cancel it and clean the + * deferrable server variables intended for start_dl_timer(). + */ + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + } + } } /* @@ -1023,6 +1087,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) } replenish_dl_new_period(dl_se, rq); + } else if (dl_server(dl_se) && dl_se->dl_defer) { + /* + * The server can still use its previous deadline, so check if + * it left the dl_defer_running state. + */ + if (!dl_se->dl_defer_running) { + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + } } } @@ -1055,8 +1128,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. + * + * The deferred reservation will have its timer set to + * (deadline - runtime). At that point, the CBS rule will decide + * if the current deadline can be used, or if a replenishment is + * required to avoid add too much pressure on the system + * (current u > U). */ - act = ns_to_ktime(dl_next_period(dl_se)); + if (dl_se->dl_defer_armed) { + WARN_ON_ONCE(!dl_se->dl_throttled); + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); + } else { + /* act = deadline - rel-deadline + period */ + act = ns_to_ktime(dl_next_period(dl_se)); + } + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -1106,6 +1192,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) #endif } +/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ +static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; + +static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) +{ + struct rq *rq = rq_of_dl_se(dl_se); + u64 fw; + + scoped_guard (rq_lock, rq) { + struct rq_flags *rf = &scope.rf; + + if (!dl_se->dl_throttled || !dl_se->dl_runtime) + return HRTIMER_NORESTART; + + sched_clock_tick(); + update_rq_clock(rq); + + if (!dl_se->dl_runtime) + return HRTIMER_NORESTART; + + if (!dl_se->server_has_tasks(dl_se)) { + replenish_dl_entity(dl_se); + return HRTIMER_NORESTART; + } + + if (dl_se->dl_defer_armed) { + /* + * First check if the server could consume runtime in background. + * If so, it is possible to push the defer timer for this amount + * of time. The dl_server_min_res serves as a limit to avoid + * forwarding the timer for a too small amount of time. + */ + if (dl_time_before(rq_clock(dl_se->rq), + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { + + /* reset the defer timer */ + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; + + hrtimer_forward_now(timer, ns_to_ktime(fw)); + return HRTIMER_RESTART; + } + + dl_se->dl_defer_running = 1; + } + + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) + resched_curr(rq); + + __push_dl_task(rq, rf); + } + + return HRTIMER_NORESTART; +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1128,28 +1270,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct rq_flags rf; struct rq *rq; - if (dl_server(dl_se)) { - struct rq *rq = rq_of_dl_se(dl_se); - struct rq_flags rf; - - rq_lock(rq, &rf); - if (dl_se->dl_throttled) { - sched_clock_tick(); - update_rq_clock(rq); - - if (dl_se->server_has_tasks(dl_se)) { - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); - resched_curr(rq); - __push_dl_task(rq, &rf); - } else { - replenish_dl_entity(dl_se); - } - - } - rq_unlock(rq, &rf); - - return HRTIMER_NORESTART; - } + if (dl_server(dl_se)) + return dl_server_timer(timer, dl_se); p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); @@ -1319,22 +1441,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); -static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { s64 scaled_delta_exec; - if (unlikely(delta_exec <= 0)) { - if (unlikely(dl_se->dl_yielded)) - goto throttle; - return; - } - - if (dl_entity_is_special(dl_se)) - return; - /* * For tasks that participate in GRUB, we implement GRUB-PA: the * spare reclaimed bandwidth is used to clock down frequency. @@ -1353,8 +1463,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); } + return scaled_delta_exec; +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +{ + s64 scaled_delta_exec; + + if (unlikely(delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; + return; + } + + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) + return; + + if (dl_entity_is_special(dl_se)) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + dl_se->runtime -= scaled_delta_exec; + /* + * The fair server can consume its runtime while throttled (not queued/ + * running as regular CFS). + * + * If the server consumes its entire runtime in this state. The server + * is not required for the current period. Thus, reset the server by + * starting a new period, pushing the activation. + */ + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + /* + * If the server was previously activated - the starving condition + * took place, it this point it went away because the fair scheduler + * was able to get runtime in background. So return to the initial + * state. + */ + dl_se->dl_defer_running = 0; + + hrtimer_try_to_cancel(&dl_se->dl_timer); + + replenish_dl_new_period(dl_se, dl_se->rq); + + /* + * Not being able to start the timer seems problematic. If it could not + * be started for whatever reason, we need to "unthrottle" the DL server + * and queue right away. Otherwise nothing might queue it. That's similar + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. + */ + WARN_ON_ONCE(!start_dl_timer(dl_se)); + + return; + } + throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; @@ -1382,6 +1548,14 @@ throttle: } /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_se == &rq->fair_server) + return; + +#ifdef CONFIG_RT_GROUP_SCHED + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks * would be able to exceed the shared quota. @@ -1405,34 +1579,155 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } +#endif +} + +/* + * In the non-defer mode, the idle time is not accounted, as the + * server provides a guarantee. + * + * If the dl_server is in defer mode, the idle time is also considered + * as time available for the fair server, avoiding a penalty for the + * rt scheduler that did not consumed that time. + */ +void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +{ + s64 delta_exec, scaled_delta_exec; + + if (!rq->fair_server.dl_defer) + return; + + /* no need to discount more */ + if (rq->fair_server.runtime < 0) + return; + + delta_exec = rq_clock_task(rq) - p->se.exec_start; + if (delta_exec < 0) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); + + rq->fair_server.runtime -= scaled_delta_exec; + + if (rq->fair_server.runtime < 0) { + rq->fair_server.dl_defer_running = 0; + rq->fair_server.runtime = 0; + } + + p->se.exec_start = rq_clock_task(rq); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { - update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + /* 0 runtime = fair server disabled */ + if (dl_se->dl_runtime) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_start(struct sched_dl_entity *dl_se) { + struct rq *rq = dl_se->rq; + + /* + * XXX: the apply do not work fine at the init phase for the + * fair server because things are not yet set. We need to improve + * this before getting generic. + */ if (!dl_server(dl_se)) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + dl_server_apply_params(dl_se, runtime, period, 1); + dl_se->dl_server = 1; + dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); } + + if (!dl_se->dl_runtime) + return; + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) + resched_curr(dl_se->rq); } void dl_server_stop(struct sched_dl_entity *dl_se) { + if (!dl_se->dl_runtime) + return; + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick) + dl_server_pick_f pick_task) { dl_se->rq = rq; dl_se->server_has_tasks = has_tasks; - dl_se->server_pick = pick; + dl_se->server_pick_task = pick_task; +} + +void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 new_bw = dl_se->dl_bw; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + + dl_b = dl_bw_of(cpu_of(rq)); + guard(raw_spinlock)(&dl_b->lock); + + if (!dl_bw_cpus(cpu)) + return; + + __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); +} + +int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) +{ + u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 new_bw = to_ratio(period, runtime); + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + unsigned long cap; + int retval = 0; + int cpus; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) + return -EBUSY; + + if (init) { + __add_rq_bw(new_bw, &rq->dl); + __dl_add(dl_b, new_bw, cpus); + } else { + __dl_sub(dl_b, dl_se->dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + + dl_rq_change_utilization(rq, dl_se, new_bw); + } + + dl_se->dl_runtime = runtime; + dl_se->dl_deadline = period; + dl_se->dl_period = period; + + dl_se->runtime = 0; + dl_se->deadline = 0; + + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); + + return retval; } /* @@ -1599,46 +1894,40 @@ static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); } -static inline struct sched_statistics * +static __always_inline struct sched_statistics * __schedstats_from_dl_se(struct sched_dl_entity *dl_se) { + if (!schedstat_enabled()) + return NULL; + + if (dl_server(dl_se)) + return NULL; + return &dl_task_of(dl_se)->stats; } static inline void update_stats_wait_start_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_start(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void update_stats_wait_end_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_wait_end(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void update_stats_enqueue_sleeper_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se) { - struct sched_statistics *stats; - - if (!schedstat_enabled()) - return; - - stats = __schedstats_from_dl_se(dl_se); - __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); + struct sched_statistics *stats = __schedstats_from_dl_se(dl_se); + if (stats) + __update_stats_enqueue_sleeper(rq_of_dl_rq(dl_rq), dl_task_of(dl_se), stats); } static inline void @@ -1735,7 +2024,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) * be counted in the active utilization; hence, we need to call * add_running_bw(). */ - if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) task_contending(dl_se, flags); @@ -1757,6 +2046,25 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) setup_new_dl_entity(dl_se); } + /* + * If the reservation is still throttled, e.g., it got replenished but is a + * deferred task and still got to wait, don't enqueue. + */ + if (dl_se->dl_throttled && start_dl_timer(dl_se)) + return; + + /* + * We're about to enqueue, make sure we're not ->dl_throttled! + * In case the timer was not started, say because the defer time + * has passed, mark as not throttled and mark unarmed. + * Also cancel earlier timers, since letting those run is pointless. + */ + if (dl_se->dl_throttled) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + __enqueue_dl_entity(dl_se); } @@ -1846,7 +2154,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) enqueue_pushable_dl_task(rq, p); } -static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) +static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); @@ -1856,6 +2164,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) dequeue_dl_entity(&p->dl, flags); if (!p->dl.dl_throttled && !dl_server(&p->dl)) dequeue_pushable_dl_task(rq, p); + + return true; } /* @@ -2074,6 +2384,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); deadline_queue_push_tasks(rq); + + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, &p->dl); } static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) @@ -2086,7 +2399,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) return __node_2_dle(left); } -static struct task_struct *pick_task_dl(struct rq *rq) +/* + * __pick_next_task_dl - Helper to pick the next -deadline task to run. + * @rq: The runqueue to pick the next task from. + */ +static struct task_struct *__pick_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2100,14 +2417,13 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick(dl_se); + p = dl_se->server_pick_task(dl_se); if (!p) { - WARN_ON_ONCE(1); dl_se->dl_yielded = 1; update_curr_dl_se(rq, dl_se, 0); goto again; } - p->dl_server = dl_se; + rq->dl_server = dl_se; } else { p = dl_task_of(dl_se); } @@ -2115,24 +2431,12 @@ again: return p; } -static struct task_struct *pick_next_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq) { - struct task_struct *p; - - p = pick_task_dl(rq); - if (!p) - return p; - - if (!p->dl_server) - set_next_task_dl(rq, p, true); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, &p->dl); - - return p; + return __pick_task_dl(rq); } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct sched_dl_entity *dl_se = &p->dl; struct dl_rq *dl_rq = &rq->dl; @@ -2824,13 +3128,12 @@ DEFINE_SCHED_CLASS(dl) = { .wakeup_preempt = wakeup_preempt_dl, - .pick_next_task = pick_next_task_dl, + .pick_task = pick_task_dl, .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .balance = balance_dl, - .pick_task = pick_task_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c1eb9a1afd13..de1dc5264b3f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = { .release = seq_release, }; +enum dl_param { + DL_RUNTIME = 0, + DL_PERIOD, +}; + +static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ + +static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + u64 runtime, period; + size_t err; + int retval; + u64 value; + + err = kstrtoull_from_user(ubuf, cnt, 10, &value); + if (err) + return err; + + scoped_guard (rq_lock_irqsave, rq) { + runtime = rq->fair_server.dl_runtime; + period = rq->fair_server.dl_period; + + switch (param) { + case DL_RUNTIME: + if (runtime == value) + break; + runtime = value; + break; + case DL_PERIOD: + if (value == period) + break; + period = value; + break; + } + + if (runtime > period || + period > fair_server_period_max || + period < fair_server_period_min) { + return -EINVAL; + } + + if (rq->cfs.h_nr_running) { + update_rq_clock(rq); + dl_server_stop(&rq->fair_server); + } + + retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); + if (retval) + cnt = retval; + + if (!runtime) + printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", + cpu_of(rq)); + + if (rq->cfs.h_nr_running) + dl_server_start(&rq->fair_server); + } + + *ppos += cnt; + return cnt; +} + +static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + u64 value; + + switch (param) { + case DL_RUNTIME: + value = rq->fair_server.dl_runtime; + break; + case DL_PERIOD: + value = rq->fair_server.dl_period; + break; + } + + seq_printf(m, "%llu\n", value); + return 0; + +} + +static ssize_t +sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); +} + +static int sched_fair_server_runtime_show(struct seq_file *m, void *v) +{ + return sched_fair_server_show(m, v, DL_RUNTIME); +} + +static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_runtime_show, inode->i_private); +} + +static const struct file_operations fair_server_runtime_fops = { + .open = sched_fair_server_runtime_open, + .write = sched_fair_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static ssize_t +sched_fair_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); +} + +static int sched_fair_server_period_show(struct seq_file *m, void *v) +{ + return sched_fair_server_show(m, v, DL_PERIOD); +} + +static int sched_fair_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_period_show, inode->i_private); +} + +static const struct file_operations fair_server_period_fops = { + .open = sched_fair_server_period_open, + .write = sched_fair_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *debugfs_sched; +static void debugfs_fair_server_init(void) +{ + struct dentry *d_fair; + unsigned long cpu; + + d_fair = debugfs_create_dir("fair_server", debugfs_sched); + if (!d_fair) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_fair); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); + } +} + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -374,6 +531,8 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + debugfs_fair_server_init(); + return 0; } late_initcall(sched_init_debug); @@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', SPLIT_NS(p->se.deadline), + p->se.custom_slice ? 'S' : ' ', SPLIT_NS(p->se.slice), SPLIT_NS(p->se.sum_exec_runtime), (long long)(p->nvcsw + p->nivcsw), p->prio); - SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", + SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), - SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); #ifdef CONFIG_NUMA_BALANCING - SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif #ifdef CONFIG_CGROUP_SCHED - SEQ_printf_task_group_path(m, task_group(p), " %s") + SEQ_printf_task_group_path(m, task_group(p), " %s") #endif SEQ_printf(m, "\n"); @@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" - " wait-time sum-exec sum-sleep\n"); + SEQ_printf(m, " S task PID vruntime eligible " + "deadline slice sum-exec switches " + "prio wait-time sum-sleep sum-block" +#ifdef CONFIG_NUMA_BALANCING + " node group-id" +#endif +#ifdef CONFIG_CGROUP_SCHED + " group-path" +#endif + "\n"); SEQ_printf(m, "-------------------------------------------------------" - "------------------------------------------------------\n"); + "------------------------------------------------------" + "------------------------------------------------------" +#ifdef CONFIG_NUMA_BALANCING + "--------------" +#endif +#ifdef CONFIG_CGROUP_SCHED + "--------------" +#endif + "\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, "\n"); SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", - SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); root = __pick_root_entity(cfs_rq); @@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", - cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", @@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); + +#ifdef CONFIG_RT_GROUP_SCHED P(rt_throttled); PN(rt_time); PN(rt_runtime); +#endif #undef PN #undef PU diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9057584ec06d..03d7c5e59a18 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) static int se_is_idle(struct sched_entity *se) { - return 0; + return task_has_idle_policy(task_of(se)); } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -779,8 +779,22 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } /* ensure we never gain time by being placed backwards. */ - u64_u32_store(cfs_rq->min_vruntime, - __update_min_vruntime(cfs_rq, vruntime)); + cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); +} + +static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 min_slice = ~0ULL; + + if (curr && curr->on_rq) + min_slice = curr->slice; + + if (root) + min_slice = min(min_slice, root->min_slice); + + return min_slice; } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) @@ -799,19 +813,34 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node } } +static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->min_slice < se->min_slice) + se->min_slice = rse->min_slice; + } +} + /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { u64 old_min_vruntime = se->min_vruntime; + u64 old_min_slice = se->min_slice; struct rb_node *node = &se->run_node; se->min_vruntime = se->vruntime; __min_vruntime_update(se, node->rb_right); __min_vruntime_update(se, node->rb_left); - return se->min_vruntime == old_min_vruntime; + se->min_slice = se->slice; + __min_slice_update(se, node->rb_right); + __min_slice_update(se, node->rb_left); + + return se->min_vruntime == old_min_vruntime && + se->min_slice == old_min_slice; } RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, @@ -824,6 +853,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); se->min_vruntime = se->vruntime; + se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less, &min_vruntime_cb); } @@ -974,17 +1004,18 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough. */ -static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) +static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { if ((s64)(se->vruntime - se->deadline) < 0) - return; + return false; /* * For EEVDF the virtual time slope is determined by w_i (iow. * nice) while the request time r_i is determined by * sysctl_sched_base_slice. */ - se->slice = sysctl_sched_base_slice; + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; /* * EEVDF: vd_i = ve_i + r_i / w_i @@ -994,10 +1025,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * The task has consumed its request, reschedule. */ - if (cfs_rq->nr_running > 1) { - resched_curr(rq_of(cfs_rq)); - clear_buddies(cfs_rq, se); - } + return true; } #include "pelt.h" @@ -1135,6 +1163,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) dl_server_update(p->dl_server, delta_exec); } +static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (curr->vlag == curr->deadline) + return false; + + return !entity_eligible(cfs_rq, curr); +} + +static inline bool do_preempt_short(struct cfs_rq *cfs_rq, + struct sched_entity *pse, struct sched_entity *se) +{ + if (!sched_feat(PREEMPT_SHORT)) + return false; + + if (pse->slice >= se->slice) + return false; + + if (!entity_eligible(cfs_rq, pse)) + return false; + + if (entity_before(pse, se)) + return true; + + if (!entity_eligible(cfs_rq, se)) + return true; + + return false; +} + /* * Used by other classes to account runtime. */ @@ -1156,23 +1216,44 @@ s64 update_curr_common(struct rq *rq) static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); s64 delta_exec; + bool resched; if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq_of(cfs_rq), curr); + delta_exec = update_curr_se(rq, curr); if (unlikely(delta_exec <= 0)) return; curr->vruntime += calc_delta_fair(delta_exec, curr); - update_deadline(cfs_rq, curr); + resched = update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - if (entity_is_task(curr)) - update_curr_task(task_of(curr), delta_exec); + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + + update_curr_task(p, delta_exec); + + /* + * Any fair task that runs outside of fair_server should + * account against fair_server such that it can account for + * this time and possibly avoid running this period. + */ + if (p->dl_server != &rq->fair_server) + dl_server_update(&rq->fair_server, delta_exec); + } account_cfs_rq_runtime(cfs_rq, delta_exec); + + if (rq->nr_running == 1) + return; + + if (resched || did_preempt_short(cfs_rq, curr)) { + resched_curr(rq); + clear_buddies(cfs_rq, curr); + } } static void update_curr_fair(struct rq *rq) @@ -1742,7 +1823,7 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat) continue; if (zone_watermark_ok(zone, 0, - wmark_pages(zone, WMARK_PROMO) + enough_wmark, + promo_wmark_pages(zone) + enough_wmark, ZONE_MOVABLE, 0)) return true; } @@ -1840,8 +1921,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, * The pages in slow memory node should be migrated according * to hot/cold instead of private/shared. */ - if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && - !node_is_toptier(src_nid)) { + if (folio_use_access_time(folio)) { struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; @@ -3188,6 +3268,15 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) return true; } + /* + * This vma has not been accessed for a while, and if the number + * the threads in the same process is low, which means no other + * threads can help scan this vma, force a vma scan. + */ + if (READ_ONCE(mm->numa_scan_seq) > + (vma->numab_state->prev_scan_seq + get_nr_threads(current))) + return true; + return false; } @@ -5178,7 +5267,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) u64 vslice, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; - se->slice = sysctl_sched_base_slice; + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; vslice = calc_delta_fair(se->slice, se); /* @@ -5259,6 +5349,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { + se->deadline += se->vruntime; + se->rel_deadline = 0; + return; + } + /* * When joining the competition; the existing tasks will be, * on average, halfway through their slice, as such start tasks @@ -5279,6 +5375,9 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); static void +requeue_delayed_entity(struct sched_entity *se); + +static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { bool curr = cfs_rq->curr == se; @@ -5365,20 +5464,48 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -static void +static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +{ + se->sched_delayed = 0; + if (sched_feat(DELAY_ZERO) && se->vlag > 0) + se->vlag = 0; +} + +static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - int action = UPDATE_TG; + bool sleep = flags & DEQUEUE_SLEEP; + update_curr(cfs_rq); + + if (flags & DEQUEUE_DELAYED) { + SCHED_WARN_ON(!se->sched_delayed); + } else { + bool delay = sleep; + /* + * DELAY_DEQUEUE relies on spurious wakeups, special task + * states must not suffer spurious wakeups, excempt them. + */ + if (flags & DEQUEUE_SPECIAL) + delay = false; + + SCHED_WARN_ON(delay && se->sched_delayed); + + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + if (cfs_rq->next == se) + cfs_rq->next = NULL; + update_load_avg(cfs_rq, se, 0); + se->sched_delayed = 1; + return false; + } + } + + int action = UPDATE_TG; if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |= DO_DETACH; /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new @@ -5395,6 +5522,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); update_entity_lag(cfs_rq, se); + if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { + se->deadline -= se->vruntime; + se->rel_deadline = 1; + } + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; @@ -5414,8 +5546,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); + if (flags & DEQUEUE_DELAYED) + finish_delayed_dequeue_entity(se); + if (cfs_rq->nr_running == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); + + return true; } static void @@ -5441,6 +5578,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) } update_stats_curr_start(cfs_rq, se); + SCHED_WARN_ON(cfs_rq->curr); cfs_rq->curr = se; /* @@ -5461,6 +5599,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); + /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups @@ -5469,16 +5609,26 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq) +pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { /* * Enabling NEXT_BUDDY will affect latency but not fairness. */ if (sched_feat(NEXT_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + SCHED_WARN_ON(cfs_rq->next->sched_delayed); return cfs_rq->next; + } - return pick_eevdf(cfs_rq); + struct sched_entity *se = pick_eevdf(cfs_rq); + if (se->sched_delayed) { + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + SCHED_WARN_ON(se->sched_delayed); + SCHED_WARN_ON(se->on_rq); + return NULL; + } + return se; } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -5502,6 +5652,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } + SCHED_WARN_ON(cfs_rq->curr != prev); cfs_rq->curr = NULL; } @@ -5765,6 +5916,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; + long rq_h_nr_running = rq->cfs.h_nr_running; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5798,11 +5950,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); + int flags; + /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) goto done; - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + /* + * Abuse SPECIAL to avoid delayed dequeue in this instance. + * This avoids teaching dequeue_entities() about throttled + * entities and keeps things relatively simple. + */ + flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; + if (se->sched_delayed) + flags |= DEQUEUE_DELAYED; + dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) idle_task_delta = cfs_rq->h_nr_running; @@ -5836,6 +5998,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, task_delta); + /* Stop the fair server if throttling resulted in no runnable tasks */ + if (rq_h_nr_running && !rq->cfs.h_nr_running) + dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the @@ -5854,6 +6019,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; + long rq_h_nr_running = rq->cfs.h_nr_running; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5891,8 +6057,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); - if (se->on_rq) + if (se->on_rq) { + SCHED_WARN_ON(se->sched_delayed); break; + } enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) @@ -5923,6 +6091,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } + /* Start the fair server if un-throttling resulted in new runnable tasks */ + if (!rq_h_nr_running && rq->cfs.h_nr_running) + dl_server_start(&rq->fair_server); + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); @@ -6555,7 +6727,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) { int cpu = cpu_of(rq); - if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) + if (!cfs_bandwidth_used()) return; if (!tick_nohz_full_cpu(cpu)) @@ -6738,6 +6910,37 @@ static int sched_idle_cpu(int cpu) } #endif +static void +requeue_delayed_entity(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * se->sched_delayed should imply: se->on_rq == 1. + * Because a delayed entity is one that is still on + * the runqueue competing until elegibility. + */ + SCHED_WARN_ON(!se->sched_delayed); + SCHED_WARN_ON(!se->on_rq); + + if (sched_feat(DELAY_ZERO)) { + update_entity_lag(cfs_rq, se); + if (se->vlag > 0) { + cfs_rq->nr_running--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->vlag = 0; + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_running++; + } + } + + update_load_avg(cfs_rq, se, 0); + se->sched_delayed = 0; +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -6750,6 +6953,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); + int rq_h_nr_running = rq->cfs.h_nr_running; + u64 slice = 0; /* * The code below (indirectly) updates schedutil which looks at @@ -6757,7 +6962,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - util_est_enqueue(&rq->cfs, p); + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) + util_est_enqueue(&rq->cfs, p); + + if (flags & ENQUEUE_DELAYED) { + requeue_delayed_entity(se); + return; + } /* * If in_iowait is set, the code below may not trigger any cpufreq @@ -6768,10 +6979,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); for_each_sched_entity(se) { - if (se->on_rq) + if (se->on_rq) { + if (se->sched_delayed) + requeue_delayed_entity(se); break; + } cfs_rq = cfs_rq_of(se); + + /* + * Basically set the slice of group entries to the min_slice of + * their respective cfs_rq. This ensures the group can service + * its entities in the desired time-frame. + */ + if (slice) { + se->slice = slice; + se->custom_slice = 1; + } enqueue_entity(cfs_rq, se, flags); + slice = cfs_rq_min_slice(cfs_rq); cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; @@ -6793,6 +7018,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); + se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; @@ -6804,6 +7032,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) goto enqueue_throttle; } + if (!rq_h_nr_running && rq->cfs.h_nr_running) { + /* Account for idle runtime */ + if (!rq->nr_running) + dl_server_update_idle_time(rq, rq->curr); + dl_server_start(&rq->fair_server); + } + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -6833,36 +7068,59 @@ enqueue_throttle: static void set_next_buddy(struct sched_entity *se); /* - * The dequeue_task method is called before nr_running is - * decreased. We remove the task from the rbtree and - * update the fair scheduling stats: + * Basically dequeue_task_fair(), except it can deal with dequeue_entity() + * failing half-way through and resume the dequeue later. + * + * Returns: + * -1 - dequeue delayed + * 0 - dequeue throttled + * 1 - dequeue complete */ -static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se; - int task_sleep = flags & DEQUEUE_SLEEP; - int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + int rq_h_nr_running = rq->cfs.h_nr_running; + bool task_sleep = flags & DEQUEUE_SLEEP; + bool task_delayed = flags & DEQUEUE_DELAYED; + struct task_struct *p = NULL; + int idle_h_nr_running = 0; + int h_nr_running = 0; + struct cfs_rq *cfs_rq; + u64 slice = 0; - util_est_dequeue(&rq->cfs, p); + if (entity_is_task(se)) { + p = task_of(se); + h_nr_running = 1; + idle_h_nr_running = task_has_idle_policy(p); + } else { + cfs_rq = group_cfs_rq(se); + slice = cfs_rq_min_slice(cfs_rq); + } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, flags); - cfs_rq->h_nr_running--; + if (!dequeue_entity(cfs_rq, se, flags)) { + if (p && &p->se == se) + return -1; + + break; + } + + cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + idle_h_nr_running = h_nr_running; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; + return 0; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + slice = cfs_rq_min_slice(cfs_rq); + /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); /* @@ -6874,6 +7132,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; } flags |= DEQUEUE_SLEEP; + flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); } for_each_sched_entity(se) { @@ -6883,28 +7142,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) se_update_runnable(se); update_cfs_group(se); - cfs_rq->h_nr_running--; + se->slice = slice; + slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + idle_h_nr_running = h_nr_running; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - goto dequeue_throttle; - + return 0; } - /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, 1); + sub_nr_running(rq, h_nr_running); + + if (rq_h_nr_running && !rq->cfs.h_nr_running) + dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; -dequeue_throttle: - util_est_update(&rq->cfs, p, task_sleep); + if (p && task_delayed) { + SCHED_WARN_ON(!task_sleep); + SCHED_WARN_ON(p->on_rq != 1); + + /* Fix-up what dequeue_task_fair() skipped */ + hrtick_update(rq); + + /* Fix-up what block_task() skipped. */ + __block_task(rq, p); + } + + return 1; +} + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) + util_est_dequeue(&rq->cfs, p); + + if (dequeue_entities(rq, &p->se, flags) < 0) { + util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); + return false; + } + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); hrtick_update(rq); + return true; } #ifdef CONFIG_SMP @@ -7803,6 +8095,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) } /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the IRQ utilization. + * + * The DL bandwidth number OTOH is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max) +{ + unsigned long util, irq, scale; + struct rq *rq = cpu_rq(cpu); + + scale = arch_scale_cpu_capacity(cpu); + + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= scale)) { + if (min) + *min = scale; + if (max) + *max = scale; + return scale; + } + + if (min) { + /* + * The minimum utilization returns the highest level between: + * - the computed DL bandwidth needed with the IRQ pressure which + * steals time to the deadline task. + * - The minimum performance requirement for CFS and/or RT. + */ + *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); + + /* + * When an RT task is runnable and uclamp is not used, we must + * ensure that the task will run at maximum compute capacity. + */ + if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) + *min = max(*min, scale); + } + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + */ + util = util_cfs + cpu_util_rt(rq); + util += cpu_util_dl(rq); + + /* + * The maximum hint is a soft bandwidth requirement, which can be lower + * than the actual utilization because of uclamp_max requirements. + */ + if (max) + *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); + + if (util >= scale) + return scale; + + /* + * There is still idle time; further improve the number by using the + * IRQ metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, scale); + util += irq; + + return min(scale, util); +} + +unsigned long sched_cpu_util(int cpu) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); +} + +/* * energy_env - Utilization landscape for energy estimation. * @task_busy_time: Utilization contribution by the task for which we test the * placement. Given by eenv_task_busy_time(). @@ -8286,7 +8677,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) static void task_dead_fair(struct task_struct *p) { - remove_entity_load_avg(&p->se); + struct sched_entity *se = &p->se; + + if (se->sched_delayed) { + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(p, &rf); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + task_rq_unlock(rq, p, &rf); + } + + remove_entity_load_avg(se); } /* @@ -8322,7 +8727,7 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - if (rq->nr_running) + if (sched_fair_runnable(rq)) return 1; return sched_balance_newidle(rq, rf) != 0; @@ -8381,16 +8786,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (test_tsk_need_resched(curr)) return; - /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(task_has_idle_policy(curr)) && - likely(!task_has_idle_policy(p))) - goto preempt; - - /* - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ - if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + if (!sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -8400,7 +8796,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int pse_is_idle = se_is_idle(pse); /* - * Preempt an idle group in favor of a non-idle group (and don't preempt + * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ if (cse_is_idle && !pse_is_idle) @@ -8408,11 +8804,26 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (cse_is_idle != pse_is_idle) return; + /* + * BATCH and IDLE tasks do not preempt others. + */ + if (unlikely(p->policy != SCHED_NORMAL)) + return; + cfs_rq = cfs_rq_of(se); update_curr(cfs_rq); + /* + * If @p has a shorter slice than current and @p is eligible, override + * current's slice protection in order to allow preemption. + * + * Note that even if @p does not turn out to be the most eligible + * task at this moment, current's slice protection will be lost. + */ + if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) + se->vlag = se->deadline + 1; /* - * XXX pick_eevdf(cfs_rq) != se ? + * If @p has become the most eligible task, force preemption. */ if (pick_eevdf(cfs_rq) == pse) goto preempt; @@ -8423,7 +8834,6 @@ preempt: resched_curr(rq); } -#ifdef CONFIG_SMP static struct task_struct *pick_task_fair(struct rq *rq) { struct sched_entity *se; @@ -8435,95 +8845,58 @@ again: return NULL; do { - struct sched_entity *curr = cfs_rq->curr; + /* Might not have done put_prev_entity() */ + if (cfs_rq->curr && cfs_rq->curr->on_rq) + update_curr(cfs_rq); - /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto again; - } + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto again; - se = pick_next_entity(cfs_rq); + se = pick_next_entity(rq, cfs_rq); + if (!se) + goto again; cfs_rq = group_cfs_rq(se); } while (cfs_rq); return task_of(se); } -#endif + +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; int new_tasks; again: - if (!sched_fair_runnable(rq)) + p = pick_task_fair(rq); + if (!p) goto idle; + se = &p->se; #ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple; + __put_prev_set_next_dl_server(rq, prev, p); + /* * Because of the set_next_buddy() in dequeue_task_fair() it is rather * likely that a next task is from the same cgroup as the current. * * Therefore attempt to avoid putting and setting the entire cgroup * hierarchy, only change the part that actually changes. - */ - - do { - struct sched_entity *curr = cfs_rq->curr; - - /* - * Since we got here without doing put_prev_entity() we also - * have to consider cfs_rq->curr. If it is still a runnable - * entity, update_curr() will update its vruntime, otherwise - * forget we've ever seen it. - */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - /* - * This call to check_cfs_rq_runtime() will do the - * throttle and dequeue its entity in the parent(s). - * Therefore the nr_running test will indeed - * be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { - cfs_rq = &rq->cfs; - - if (!cfs_rq->nr_running) - goto idle; - - goto simple; - } - } - - se = pick_next_entity(cfs_rq); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - - /* + * * Since we haven't yet done put_prev_entity and if the selected task * is a different task than we started out with, try and touch the * least amount of cfs_rqs. */ if (prev != p) { struct sched_entity *pse = &prev->se; + struct cfs_rq *cfs_rq; while (!(cfs_rq = is_same_group(se, pse))) { int se_depth = se->depth; @@ -8541,38 +8914,15 @@ again: put_prev_entity(cfs_rq, pse); set_next_entity(cfs_rq, se); - } - - goto done; -simple: -#endif - if (prev) - put_prev_task(rq, prev); - do { - se = pick_next_entity(cfs_rq); - set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); + __set_next_task_fair(rq, p, true); + } - p = task_of(se); + return p; -done: __maybe_unused; -#ifdef CONFIG_SMP - /* - * Move the next running task to the front of - * the list, so our cfs_tasks list becomes MRU - * one. - */ - list_move(&p->se.group_node, &rq->cfs_tasks); +simple: #endif - - if (hrtick_enabled_fair(rq)) - hrtick_start_fair(rq, p); - - update_misfit_status(p, rq); - sched_fair_update_stop_tick(rq, p); - + put_prev_set_next_task(rq, prev, p); return p; idle: @@ -8601,15 +8951,34 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq) +static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - return pick_next_task_fair(rq, NULL, NULL); + return pick_next_task_fair(rq, prev, NULL); +} + +static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) +{ + return !!dl_se->rq->cfs.nr_running; +} + +static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) +{ + return pick_task_fair(dl_se->rq); +} + +void fair_server_init(struct rq *rq) +{ + struct sched_dl_entity *dl_se = &rq->fair_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); } /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -9360,9 +9729,10 @@ static bool __update_blocked_others(struct rq *rq, bool *done) hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + /* hw_pressure doesn't care about invariance */ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | - update_hw_load_avg(now, rq, hw_pressure) | + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) | update_irq_load_avg(rq, 0); if (others_have_blocked(rq)) @@ -12483,7 +12853,7 @@ out: * - indirectly from a remote scheduler_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void sched_balance_softirq(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(void) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; @@ -12702,22 +13072,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct sched_entity *se = &p->se, *curr; - struct cfs_rq *cfs_rq; - struct rq *rq = this_rq(); - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - set_task_max_allowed_capacity(p); - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - if (curr) - update_curr(cfs_rq); - place_entity(cfs_rq, se, ENQUEUE_INITIAL); - rq_unlock(rq, &rf); } /* @@ -12829,10 +13184,28 @@ static void attach_task_cfs_rq(struct task_struct *p) static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); + /* + * Since this is called after changing class, this is a little weird + * and we cannot use DEQUEUE_DELAYED. + */ + if (p->se.sched_delayed) { + /* First, dequeue it from its new class' structures */ + dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); + /* + * Now, clean up the fair_sched_class side of things + * related to sched_delayed being true and that wasn't done + * due to the generic dequeue not using DEQUEUE_DELAYED. + */ + finish_delayed_dequeue_entity(&p->se); + p->se.rel_deadline = 0; + __block_task(rq, p); + } } static void switched_to_fair(struct rq *rq, struct task_struct *p) { + SCHED_WARN_ON(p->se.sched_delayed); + attach_task_cfs_rq(p); set_task_max_allowed_capacity(p); @@ -12850,12 +13223,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) } } -/* Account for a task changing its policy or group. - * - * This routine is mostly called to set cfs_rq->curr field when a task - * migrates between groups/classes. - */ -static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { struct sched_entity *se = &p->se; @@ -12868,6 +13236,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) list_move(&se->group_node, &rq->cfs_tasks); } #endif + if (!first) + return; + + SCHED_WARN_ON(se->sched_delayed); + + if (hrtick_enabled_fair(rq)) + hrtick_start_fair(rq, p); + + update_misfit_status(p, rq); + sched_fair_update_stop_tick(rq, p); +} + +/* + * Account for a task changing its policy or group. + * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +{ + struct sched_entity *se = &p->se; for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -12876,12 +13265,14 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } + + __set_next_task_fair(rq, p, first); } void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif @@ -12983,28 +13374,35 @@ void online_fair_sched_group(struct task_group *tg) void unregister_fair_sched_group(struct task_group *tg) { - unsigned long flags; - struct rq *rq; int cpu; destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(cpu) { - if (tg->se[cpu]) - remove_entity_load_avg(tg->se[cpu]); + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct sched_entity *se = tg->se[cpu]; + struct rq *rq = cpu_rq(cpu); + + if (se) { + if (se->sched_delayed) { + guard(rq_lock_irqsave)(rq); + if (se->sched_delayed) { + update_rq_clock(rq); + dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + } + list_del_leaf_cfs_rq(cfs_rq); + } + remove_entity_load_avg(se); + } /* * Only empty task groups can be destroyed; so we can speculatively * check on_list without danger of it being re-added. */ - if (!tg->cfs_rq[cpu]->on_list) - continue; - - rq = cpu_rq(cpu); - - raw_spin_rq_lock_irqsave(rq, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_rq_unlock_irqrestore(rq, flags); + if (cfs_rq->on_list) { + guard(rq_lock_irqsave)(rq); + list_del_leaf_cfs_rq(cfs_rq); + } } } @@ -13194,13 +13592,13 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, + .pick_task = pick_task_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP .balance = balance_fair, - .pick_task = pick_task_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 143f55df890b..290874079f60 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -5,8 +5,24 @@ * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. */ SCHED_FEAT(PLACE_LAG, true) +/* + * Give new tasks half a slice to ease into the competition. + */ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +/* + * Preserve relative virtual deadline on 'migration'. + */ +SCHED_FEAT(PLACE_REL_DEADLINE, true) +/* + * Inhibit (wakeup) preemption until the current task has either matched the + * 0-lag point or until is has exhausted it's slice. + */ SCHED_FEAT(RUN_TO_PARITY, true) +/* + * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for + * current. + */ +SCHED_FEAT(PREEMPT_SHORT, true) /* * Prefer to schedule the task we woke last (assuming it failed @@ -22,6 +38,18 @@ SCHED_FEAT(NEXT_BUDDY, false) SCHED_FEAT(CACHE_HOT_BUDDY, true) /* + * Delay dequeueing tasks until they get selected or woken. + * + * By delaying the dequeue for non-eligible tasks, they remain in the + * competition and can burn off their negative lag. When they get selected + * they'll have positive lag by definition. + * + * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. + */ +SCHED_FEAT(DELAY_DEQUEUE, true) +SCHED_FEAT(DELAY_ZERO, true) + +/* * Allow wakeup-time preemption of the current task: */ SCHED_FEAT(WAKEUP_PREEMPTION, true) @@ -85,5 +113,3 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) - -SCHED_FEAT(HZ_BW, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 6e78d071beb5..7a105a0123aa 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -450,43 +450,35 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + dl_server_update_idle_time(rq, prev); } static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) { update_idle_core(rq); schedstat_inc(rq->sched_goidle); + next->se.exec_start = rq_clock_task(rq); } -#ifdef CONFIG_SMP -static struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq) { return rq->idle; } -#endif - -struct task_struct *pick_next_task_idle(struct rq *rq) -{ - struct task_struct *next = rq->idle; - - set_next_task_idle(rq, next, true); - - return next; -} /* * It is not legal to sleep in the idle task - print a warning * message if some code attempts to do it: */ -static void +static bool dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) { raw_spin_rq_unlock_irq(rq); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); dump_stack(); raw_spin_rq_lock_irq(rq); + return true; } /* @@ -528,13 +520,12 @@ DEFINE_SCHED_CLASS(idle) = { .wakeup_preempt = wakeup_preempt_idle, - .pick_next_task = pick_next_task_idle, + .pick_task = pick_task_idle, .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP .balance = balance_idle, - .pick_task = pick_task_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 310523c1b9e3..172c588de542 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -struct rt_bandwidth def_rt_bandwidth; - /* * period over which we measure -rt task CPU usage in us. * default: 1s @@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void) late_initcall(sched_rt_sysctl_init); #endif +void init_rt_rq(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif /* CONFIG_SMP */ + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); +#endif +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) do_start_rt_bandwidth(rt_b); } -void init_rt_rq(struct rt_rq *rt_rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bit-search: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO-1; - rt_rq->highest_prio.next = MAX_RT_PRIO-1; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ - /* We start is dequeued state, because no RT tasks are queued */ - rt_rq->rt_queued = 0; - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) { hrtimer_cancel(&rt_b->rt_period_timer); @@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg) { if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); - } void free_rt_sched_group(struct task_group *tg) @@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!tg->rt_se) goto err; - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); + init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); for_each_possible_cpu(i) { rt_rq = kzalloc_node(sizeof(struct rt_rq), @@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else /* !CONFIG_RT_GROUP_SCHED */ - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(def_rt_bandwidth.rt_period); -} - -typedef struct rt_rq *rt_rq_iter_t; - -#define for_each_rt_rq(rt_rq, iter, rq) \ - for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = NULL) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return NULL; -} - -static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (!rt_rq->rt_nr_running) - return; - - enqueue_top_rt_rq(rt_rq); - resched_curr(rq); -} - -static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ - dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return &cpu_rq(cpu)->rt; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &def_rt_bandwidth; -} - -#endif /* CONFIG_RT_GROUP_SCHED */ - bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); -#ifdef CONFIG_RT_GROUP_SCHED + /* * FIXME: isolated CPUs should really leave the root task group, * whether they are isolcpus or were isolated via cpusets, lest @@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) */ if (rt_b == &root_task_group.rt_bandwidth) span = cpu_online_mask; -#endif + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return idle; } -static inline int rt_se_prio(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq) - return rt_rq->highest_prio.curr; -#endif - - return rt_task_of(rt_se)->prio; -} - static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } +#else /* !CONFIG_RT_GROUP_SCHED */ + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_curr(rq); +} + +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return false; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +#ifdef CONFIG_SMP +static void __enable_runtime(struct rq *rq) { } +static void __disable_runtime(struct rq *rq) { } +#endif + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio.curr; +#endif + + return rt_task_of(rt_se)->prio; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; - struct sched_rt_entity *rt_se = &curr->rt; s64 delta_exec; if (curr->sched_class != &rt_sched_class) @@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq) if (unlikely(delta_exec <= 0)) return; +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity *rt_se = &curr->rt; + if (!rt_bandwidth_enabled()) return; @@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } +#endif } static void @@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - start_rt_bandwidth(&def_rt_bandwidth); } static inline @@ -1492,7 +1483,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_pushable_task(rq, p); } -static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) +static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; @@ -1500,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); + + return true; } /* @@ -1755,17 +1748,7 @@ static struct task_struct *pick_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) -{ - struct task_struct *p = pick_task_rt(rq); - - if (p) - set_next_task_rt(rq, p, true); - - return p; -} - -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct sched_rt_entity *rt_se = &p->rt; struct rt_rq *rt_rq = &rq->rt; @@ -2652,13 +2635,12 @@ DEFINE_SCHED_CLASS(rt) = { .wakeup_preempt = wakeup_preempt_rt, - .pick_next_task = pick_next_task_rt, + .pick_task = pick_task_rt, .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP .balance = balance_rt, - .pick_task = pick_task_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, @@ -2912,19 +2894,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; } #endif /* CONFIG_SYSCTL */ @@ -2944,12 +2913,6 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c36cc680361..3744f16a1293 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -68,6 +68,7 @@ #include <linux/wait_api.h> #include <linux/wait_bit.h> #include <linux/workqueue_api.h> +#include <linux/delayacct.h> #include <trace/events/power.h> #include <trace/events/sched.h> @@ -335,7 +336,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int dl_bw_check_overflow(int cpu); - +extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following * interface: @@ -361,7 +362,14 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick); + dl_server_pick_f pick_task); + +extern void dl_server_update_idle_time(struct rq *rq, + struct task_struct *p); +extern void fair_server_init(struct rq *rq); +extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); +extern int dl_server_apply_params(struct sched_dl_entity *dl_se, + u64 runtime, u64 period, bool init); #ifdef CONFIG_CGROUP_SCHED @@ -599,17 +607,12 @@ struct cfs_rq { s64 avg_vruntime; u64 avg_load; - u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 min_vruntime_fi; #endif -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - struct rb_root_cached tasks_timeline; /* @@ -619,10 +622,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - #ifdef CONFIG_SMP /* * CFS load tracking @@ -726,13 +725,13 @@ struct rt_rq { #endif /* CONFIG_SMP */ int rt_queued; +#ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; -#ifdef CONFIG_RT_GROUP_SCHED unsigned int rt_nr_boosted; struct rq *rq; @@ -820,6 +819,9 @@ static inline void se_update_runnable(struct sched_entity *se) static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + if (entity_is_task(se)) return !!se->on_rq; else @@ -834,6 +836,9 @@ static inline void se_update_runnable(struct sched_entity *se) { } static inline long se_runnable(struct sched_entity *se) { + if (se->sched_delayed) + return false; + return !!se->on_rq; } @@ -1044,6 +1049,8 @@ struct rq { struct rt_rq rt; struct dl_rq dl; + struct sched_dl_entity fair_server; + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ struct list_head leaf_cfs_rq_list; @@ -1059,6 +1066,7 @@ struct rq { unsigned int nr_uninterruptible; struct task_struct __rcu *curr; + struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; @@ -1158,7 +1166,6 @@ struct rq { /* latency stats */ struct sched_info rq_sched_info; unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_count; @@ -1187,6 +1194,7 @@ struct rq { /* per rq */ struct rq *core; struct task_struct *core_pick; + struct sched_dl_entity *core_dl_server; unsigned int core_enabled; unsigned int core_sched_seq; struct rb_root core_tree; @@ -2247,11 +2255,13 @@ extern const u32 sched_prio_to_wmult[40]; * */ -#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ +#define DEQUEUE_SPECIAL 0x10 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2267,6 +2277,7 @@ extern const u32 sched_prio_to_wmult[40]; #endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 +#define ENQUEUE_DELAYED 0x200 #define RETRY_TASK ((void *)-1UL) @@ -2285,23 +2296,31 @@ struct sched_class { #endif void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); - struct task_struct *(*pick_next_task)(struct rq *rq); + struct task_struct *(*pick_task)(struct rq *rq); + /* + * Optional! When implemented pick_next_task() should be equivalent to: + * + * next = pick_task(); + * if (next) { + * put_prev_task(prev); + * set_next_task_first(next); + * } + */ + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); - void (*put_prev_task)(struct rq *rq, struct task_struct *p); + void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); #ifdef CONFIG_SMP int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); - struct task_struct * (*pick_task)(struct rq *rq); - void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); @@ -2345,7 +2364,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev); + prev->sched_class->put_prev_task(rq, prev, NULL); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -2353,6 +2372,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } +static inline void +__put_prev_set_next_dl_server(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + prev->dl_server = NULL; + next->dl_server = rq->dl_server; + rq->dl_server = NULL; +} + +static inline void put_prev_set_next_task(struct rq *rq, + struct task_struct *prev, + struct task_struct *next) +{ + WARN_ON_ONCE(rq->curr != prev); + + __put_prev_set_next_dl_server(rq, prev, next); + + if (next == prev) + return; + + prev->sched_class->put_prev_task(rq, prev, next); + next->sched_class->set_next_task(rq, next, true); +} /* * Helper to define a sched_class instance; each one is placed in a separate @@ -2408,7 +2451,7 @@ static inline bool sched_fair_runnable(struct rq *rq) } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_next_task_idle(struct rq *rq); +extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2515,7 +2558,6 @@ extern void reweight_task(struct task_struct *p, const struct load_weight *lw); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); -extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -2586,6 +2628,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) sched_update_tick_dependency(rq); } +static inline void __block_task(struct rq *rq, struct task_struct *p) +{ + WRITE_ONCE(p->on_rq, 0); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + if (p->sched_contributes_to_load) + rq->nr_uninterruptible++; + + if (p->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } +} + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); @@ -3607,7 +3662,7 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *c extern void __setscheduler_prio(struct task_struct *p, int prio); extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); -extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); +extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index b1b8fe61c532..058dd42e3d9b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq) return rq->stop; } -static struct task_struct *pick_next_task_stop(struct rq *rq) -{ - struct task_struct *p = pick_task_stop(rq); - - if (p) - set_next_task_stop(rq, p, true); - - return p; -} - static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); } -static void +static bool dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + return true; } static void yield_task_stop(struct rq *rq) @@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. */ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) { update_curr_common(rq); } @@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = { .wakeup_preempt = wakeup_preempt_stop, - .pick_next_task = pick_next_task_stop, + .pick_task = pick_task_stop, .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP .balance = balance_stop, - .pick_task = pick_task_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index ae1b42775ef9..cb03c790c27a 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p) * keep the priority unchanged. Otherwise, update priority * to the normal priority: */ - if (!rt_prio(p->prio)) + if (!rt_or_dl_prio(p->prio)) return p->normal_prio; return p->prio; } @@ -258,107 +258,6 @@ int sched_core_idle_cpu(int cpu) #endif -#ifdef CONFIG_SMP -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the IRQ utilization. - * - * The DL bandwidth number OTOH is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max) -{ - unsigned long util, irq, scale; - struct rq *rq = cpu_rq(cpu); - - scale = arch_scale_cpu_capacity(cpu); - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= scale)) { - if (min) - *min = scale; - if (max) - *max = scale; - return scale; - } - - if (min) { - /* - * The minimum utilization returns the highest level between: - * - the computed DL bandwidth needed with the IRQ pressure which - * steals time to the deadline task. - * - The minimum performance requirement for CFS and/or RT. - */ - *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); - - /* - * When an RT task is runnable and uclamp is not used, we must - * ensure that the task will run at maximum compute capacity. - */ - if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) - *min = max(*min, scale); - } - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - */ - util = util_cfs + cpu_util_rt(rq); - util += cpu_util_dl(rq); - - /* - * The maximum hint is a soft bandwidth requirement, which can be lower - * than the actual utilization because of uclamp_max requirements. - */ - if (max) - *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); - - if (util >= scale) - return scale; - - /* - * There is still idle time; further improve the number by using the - * IRQ metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, scale); - util += irq; - - return min(scale, util); -} - -unsigned long sched_cpu_util(int cpu) -{ - return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); -} -#endif /* CONFIG_SMP */ - /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. @@ -401,10 +300,28 @@ static void __setscheduler_params(struct task_struct *p, p->policy = policy; - if (dl_policy(policy)) + if (dl_policy(policy)) { __setparam_dl(p, attr); - else if (fair_policy(policy)) + } else if (fair_policy(policy)) { p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + p->se.custom_slice = 1; + p->se.slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + p->se.custom_slice = 0; + p->se.slice = sysctl_sched_base_slice; + } + } + + /* rt-policy tasks do not have a timerslack */ + if (rt_or_dl_task_policy(p)) { + p->timer_slack_ns = 0; + } else if (p->timer_slack_ns == 0) { + /* when switching back to non-rt policy, restore timerslack */ + p->timer_slack_ns = p->default_timer_slack_ns; + } /* * __sched_setscheduler() ensures attr->sched_priority == 0 when @@ -700,7 +617,9 @@ recheck: * but store a possible modification of reset_on_fork. */ if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != task_nice(p)) + if (fair_policy(policy) && + (attr->sched_nice != task_nice(p) || + (attr->sched_runtime != p->se.slice))) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; @@ -846,6 +765,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, .sched_nice = PRIO_TO_NICE(p->static_prio), }; + if (p->se.custom_slice) + attr.sched_runtime = p->se.slice; + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; @@ -1012,12 +934,14 @@ err_size: static void get_params(struct task_struct *p, struct sched_attr *attr) { - if (task_has_dl_policy(p)) + if (task_has_dl_policy(p)) { __getparam_dl(p, attr); - else if (task_has_rt_policy(p)) + } else if (task_has_rt_policy(p)) { attr->sched_priority = p->rt_priority; - else + } else { attr->sched_nice = task_nice(p); + attr->sched_runtime = p->se.slice; + } } /** diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 76504b776d03..9748a4c8d668 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); + /* + * Because the rq is not a task, dl_add_task_root_domain() did not + * move the fair server bw to the rd if it already started. + * Add it now. + */ + if (rq->fair_server.dl_server) + __dl_server_attach_root(&rq->fair_server, rq); + rq_unlock_irqrestore(rq, &rf); if (old_rd) diff --git a/kernel/signal.c b/kernel/signal.c index 60c737e423a1..6fe29715105b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -618,20 +618,18 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is - * expected to free it. - * - * All callers have to hold the siglock. + * Try to dequeue a signal. If a deliverable signal is found fill in the + * caller provided siginfo and return the signal number. Otherwise return + * 0. */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, - kernel_siginfo_t *info, enum pid_type *type) +int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) { + struct task_struct *tsk = current; bool resched_timer = false; int signr; - /* We only dequeue private signals from ourselves, we don't let - * signalfd steal them - */ + lockdep_assert_held(&tsk->sighand->siglock); + *type = PIDTYPE_PID; signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { @@ -1940,10 +1938,11 @@ struct sigqueue *sigqueue_alloc(void) void sigqueue_free(struct sigqueue *q) { - unsigned long flags; spinlock_t *lock = ¤t->sighand->siglock; + unsigned long flags; - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC))) + return; /* * We must hold ->siglock while testing q->list * to serialize with collect_signal() or with @@ -1971,7 +1970,10 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) unsigned long flags; int ret, result; - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + if (WARN_ON_ONCE(!(q->flags & SIGQUEUE_PREALLOC))) + return 0; + if (WARN_ON_ONCE(q->info.si_code != SI_TIMER)) + return 0; ret = -1; rcu_read_lock(); @@ -2006,7 +2008,6 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) * If an SI_TIMER entry is already queue just increment * the overrun count. */ - BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; result = TRACE_SIGNAL_ALREADY_PENDING; goto out; @@ -2793,8 +2794,7 @@ relock: type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) - signr = dequeue_signal(current, ¤t->blocked, - &ksig->info, &type); + signr = dequeue_signal(¤t->blocked, &ksig->info, &type); if (!signr) break; /* will return 0 */ @@ -2888,6 +2888,8 @@ relock: current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { + int ret; + if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); @@ -2899,7 +2901,24 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(&ksig->info); + ret = do_coredump(&ksig->info); + if (ret) + coredump_report_failure("coredump has not been created, error %d", + ret); + else if (!IS_ENABLED(CONFIG_COREDUMP)) { + /* + * Coredumps are not available, can't fail collecting + * the coredump. + * + * Leave a note though that the coredump is going to be + * not created. This is not an error or a warning as disabling + * support in the kernel for coredumps isn't commonplace, and + * the user must've built the kernel with the custom config so + * let them know all works as desired. + */ + coredump_report("no coredump collected as " + "that is disabled in the kernel configuration"); + } } /* @@ -3648,7 +3667,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); - sig = dequeue_signal(tsk, &mask, info, &type); + sig = dequeue_signal(&mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested @@ -3667,7 +3686,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); - sig = dequeue_signal(tsk, &mask, info, &type); + sig = dequeue_signal(&mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); diff --git a/kernel/smp.c b/kernel/smp.c index aaffecdad319..f25e20617b7e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -208,12 +208,25 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd) return -1; } +static atomic_t n_csd_lock_stuck; + +/** + * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? + * + * Returns @true if a CSD-lock acquisition is stuck and has been stuck + * long enough for a "non-responsive CSD lock" message to be printed. + */ +bool csd_lock_is_stuck(void) +{ + return !!atomic_read(&n_csd_lock_stuck); +} + /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; @@ -229,15 +242,26 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); + atomic_dec(&n_csd_lock_stuck); return true; } ts2 = sched_clock(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; - if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) + if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * + (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || + csd_lock_timeout_ns == 0)) return false; + if (ts0 > ts2) { + /* Our own sched_clock went backward; don't blame another CPU. */ + ts_delta = ts0 - ts2; + pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); + *ts1 = ts2; + return false; + } + firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); @@ -249,9 +273,12 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; - pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", - firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta, + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); + (*nmessages)++; + if (firsttime) + atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering @@ -290,12 +317,13 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in */ static void __csd_lock_wait(call_single_data_t *csd) { + unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = sched_clock(); for (;;) { - if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 02582017759a..d082e7840f88 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -551,7 +551,7 @@ restart: kstat_incr_softirqs_this_cpu(vec_nr); trace_softirq_entry(vec_nr); - h->action(h); + h->action(); trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", @@ -700,7 +700,7 @@ void __raise_softirq_irqoff(unsigned int nr) or_softirq_pending(1UL << nr); } -void open_softirq(int nr, void (*action)(struct softirq_action *)) +void open_softirq(int nr, void (*action)(void)) { softirq_vec[nr].action = action; } @@ -760,8 +760,7 @@ static bool tasklet_clear_sched(struct tasklet_struct *t) return false; } -static void tasklet_action_common(struct softirq_action *a, - struct tasklet_head *tl_head, +static void tasklet_action_common(struct tasklet_head *tl_head, unsigned int softirq_nr) { struct tasklet_struct *list; @@ -805,16 +804,16 @@ static void tasklet_action_common(struct softirq_action *a, } } -static __latent_entropy void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(void) { workqueue_softirq_action(false); - tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); } -static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(void) { workqueue_softirq_action(true); - tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } void tasklet_setup(struct tasklet_struct *t, diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cedb17ba158a..da821ce258ea 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -251,7 +251,7 @@ static int multi_cpu_stop(void *data) */ touch_nmi_watchdog(); } - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); diff --git a/kernel/sys.c b/kernel/sys.c index 3a2df1bd9f64..b7e096e1c3a1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2557,6 +2557,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: + if (rt_or_dl_task_policy(current)) + break; if (arg2 <= 0) current->timer_slack_ns = current->default_timer_slack_ns; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5abfa4390673..8bf888641694 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -493,7 +493,7 @@ static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throt * promised in the context of posix_timer_fn() never * materialized, but someone should really work on it. * - * To prevent DOS fake @now to be 1 jiffie out which keeps + * To prevent DOS fake @now to be 1 jiffy out which keeps * the overrun accounting correct but creates an * inconsistency vs. timer_gettime(2). */ @@ -574,15 +574,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, it.alarm.alarmtimer); enum alarmtimer_restart result = ALARMTIMER_NORESTART; unsigned long flags; - int si_private = 0; spin_lock_irqsave(&ptr->it_lock, flags); - ptr->it_active = 0; - if (ptr->it_interval) - si_private = ++ptr->it_requeue_pending; - - if (posix_timer_event(ptr, si_private) && ptr->it_interval) { + if (posix_timer_queue_signal(ptr) && ptr->it_interval) { /* * Handle ignored signals and rearm the timer. This will go * away once we handle ignored signals proper. Ensure that diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 60a6484831b1..78c7bd64d0dd 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -190,7 +190,7 @@ int clockevents_tick_resume(struct clock_event_device *dev) #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST -/* Limit min_delta to a jiffie */ +/* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d0538a75f4c6..23336eecb4f4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -113,7 +113,6 @@ static u64 suspend_start; /* * Threshold: 0.0312s, when doubled: 0.0625s. - * Also a default for cs->uncertainty_margin when registering clocks. */ #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) @@ -125,6 +124,13 @@ static u64 suspend_start; * * The default of 500 parts per million is based on NTP's limits. * If a clocksource is good enough for NTP, it is good enough for us! + * + * In other words, by default, even if a clocksource is extremely + * precise (for example, with a sub-nanosecond period), the maximum + * permissible skew between the clocksource watchdog and the clocksource + * under test is not permitted to go below the 500ppm minimum defined + * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the + * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US #define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US @@ -132,6 +138,13 @@ static u64 suspend_start; #define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) #endif +/* + * Default for maximum permissible skew when cs->uncertainty_margin is + * not specified, and the lower bound even when cs->uncertainty_margin + * is specified. This is also the default that is used when registering + * clocks with unspecifed cs->uncertainty_margin, so this macro is used + * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. + */ #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG @@ -231,6 +244,7 @@ enum wd_read_status { static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { + int64_t md = 2 * watchdog->uncertainty_margin; unsigned int nretries, max_retries; int64_t wd_delay, wd_seq_delay; u64 wd_end, wd_end2; @@ -245,7 +259,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, local_irq_enable(); wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); - if (wd_delay <= WATCHDOG_MAX_SKEW) { + if (wd_delay <= md + cs->uncertainty_margin) { if (nretries > 1 && nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); @@ -258,12 +272,12 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, * there is too much external interferences that cause * significant delay in reading both clocksource and watchdog. * - * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, - * report system busy, reinit the watchdog and skip the current + * If consecutive WD read-back delay > md, report + * system busy, reinit the watchdog and skip the current * watchdog test. */ wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); - if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + if (wd_seq_delay > md) goto skip_test; } @@ -1146,14 +1160,19 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq } /* - * If the uncertainty margin is not specified, calculate it. - * If both scale and freq are non-zero, calculate the clock - * period, but bound below at 2*WATCHDOG_MAX_SKEW. However, - * if either of scale or freq is zero, be very conservative and - * take the tens-of-milliseconds WATCHDOG_THRESHOLD value for the - * uncertainty margin. Allow stupidly small uncertainty margins - * to be specified by the caller for testing purposes, but warn - * to discourage production use of this capability. + * If the uncertainty margin is not specified, calculate it. If + * both scale and freq are non-zero, calculate the clock period, but + * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. + * However, if either of scale or freq is zero, be very conservative + * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value + * for the uncertainty margin. Allow stupidly small uncertainty + * margins to be specified by the caller for testing purposes, + * but warn to discourage production use of this capability. + * + * Bottom line: The sum of the uncertainty margins of the + * watchdog clocksource and the clocksource under test will be at + * least 500ppm by default. For more information, please see the + * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. */ if (scale && freq && !cs->uncertainty_margin) { cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b8ee320208d4..cddcd08ea827 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1177,7 +1177,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution - * (i.e. one jiffie) to prevent short timeouts. + * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) @@ -1351,11 +1351,13 @@ static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) + __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) + __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } @@ -1757,7 +1759,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, } } -static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; @@ -1975,7 +1977,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { - if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) + if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } @@ -2072,14 +2074,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - u64 slack; - - slack = current->timer_slack_ns; - if (rt_task(current)) - slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -2249,7 +2246,7 @@ void __init hrtimers_init(void) /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used */ @@ -2276,13 +2273,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - /* - * Override any slack passed by the user if under - * rt contraints. - */ - if (rt_task(current)) - delta = 0; - hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); @@ -2302,7 +2292,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) - * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks + * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8d2dd214ec68..802b336f4b8c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -660,9 +660,17 @@ rearm: sched_sync_hw_clock(offset_nsec, res != 0); } -void ntp_notify_cmos_timer(void) +void ntp_notify_cmos_timer(bool offset_set) { /* + * If the time jumped (using ADJ_SETOFFSET) cancels sync timer, + * which may have been running if the time was synchronized + * prior to the ADJ_SETOFFSET call. + */ + if (offset_set) + hrtimer_cancel(&sync_hrtimer); + + /* * When the work is currently executed but has not yet the timer * rearmed this queues the work immediately again. No big issue, * just a pointless work scheduled. diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 23d1b74c3065..5a633dce9057 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -14,9 +14,9 @@ extern int __do_adjtimex(struct __kernel_timex *txc, extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) -extern void ntp_notify_cmos_timer(void); +extern void ntp_notify_cmos_timer(bool offset_set); #else -static inline void ntp_notify_cmos_timer(void) { } +static inline void ntp_notify_cmos_timer(bool offset_set) { } #endif #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e9c6f9d0e42c..6bcee4704059 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -453,6 +453,7 @@ static void disarm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; struct posix_cputimer_base *base; + timer->it_active = 0; if (!cpu_timer_dequeue(ctmr)) return; @@ -559,6 +560,7 @@ static void arm_timer(struct k_itimer *timer, struct task_struct *p) struct cpu_timer *ctmr = &timer->it.cpu; u64 newexp = cpu_timer_getexpires(ctmr); + timer->it_active = 1; if (!cpu_timer_enqueue(&base->tqhead, ctmr)) return; @@ -584,12 +586,8 @@ static void cpu_timer_fire(struct k_itimer *timer) { struct cpu_timer *ctmr = &timer->it.cpu; - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - /* - * User don't want any signal. - */ - cpu_timer_setexpires(ctmr, 0); - } else if (unlikely(timer->sigq == NULL)) { + timer->it_active = 0; + if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. @@ -600,9 +598,9 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * One-shot timer. Clear it as soon as it's fired. */ - posix_timer_event(timer, 0); + posix_timer_queue_signal(timer); cpu_timer_setexpires(ctmr, 0); - } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + } else if (posix_timer_queue_signal(timer)) { /* * The signal did not get queued because the signal * was ignored, so we won't get any callback to @@ -614,6 +612,8 @@ static void cpu_timer_fire(struct k_itimer *timer) } } +static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now); + /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -623,9 +623,10 @@ static void cpu_timer_fire(struct k_itimer *timer) static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec64 *new, struct itimerspec64 *old) { + bool sigev_none = timer->it_sigev_notify == SIGEV_NONE; clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - u64 old_expires, new_expires, old_incr, val; struct cpu_timer *ctmr = &timer->it.cpu; + u64 old_expires, new_expires, now; struct sighand_struct *sighand; struct task_struct *p; unsigned long flags; @@ -662,10 +663,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, return -ESRCH; } - /* - * Disarm any old timer after extracting its expiry time. - */ - old_incr = timer->it_interval; + /* Retrieve the current expiry time before disarming the timer */ old_expires = cpu_timer_getexpires(ctmr); if (unlikely(timer->it.cpu.firing)) { @@ -673,157 +671,122 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = TIMER_RETRY; } else { cpu_timer_dequeue(ctmr); + timer->it_active = 0; } /* - * We need to sample the current value to convert the new - * value from to relative and absolute, and to convert the - * old value from absolute to relative. To set a process - * timer, we need a sample to balance the thread expiry - * times (in arm_timer). With an absolute time, we must - * check if it's already passed. In short, we need a sample. + * Sample the current clock for saving the previous setting + * and for rearming the timer. */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) - val = cpu_clock_sample(clkid, p); + now = cpu_clock_sample(clkid, p); else - val = cpu_clock_sample_group(clkid, p, true); + now = cpu_clock_sample_group(clkid, p, !sigev_none); + /* Retrieve the previous expiry value if requested. */ if (old) { - if (old_expires == 0) { - old->it_value.tv_sec = 0; - old->it_value.tv_nsec = 0; - } else { - /* - * Update the timer in case it has overrun already. - * If it has, we'll report it as having overrun and - * with the next reloaded timer already ticking, - * though we are swallowing that pending - * notification here to install the new setting. - */ - u64 exp = bump_cpu_timer(timer, val); - - if (val < exp) { - old_expires = exp - val; - old->it_value = ns_to_timespec64(old_expires); - } else { - old->it_value.tv_nsec = 1; - old->it_value.tv_sec = 0; - } - } + old->it_value = (struct timespec64){ }; + if (old_expires) + __posix_cpu_timer_get(timer, old, now); } + /* Retry if the timer expiry is running concurrently */ if (unlikely(ret)) { - /* - * We are colliding with the timer actually firing. - * Punt after filling in the timer's old value, and - * disable this firing since we are already reporting - * it as an overrun (thanks to bump_cpu_timer above). - */ unlock_task_sighand(p, &flags); goto out; } - if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { - new_expires += val; - } + /* Convert relative expiry time to absolute */ + if (new_expires && !(timer_flags & TIMER_ABSTIME)) + new_expires += now; + + /* Set the new expiry time (might be 0) */ + cpu_timer_setexpires(ctmr, new_expires); /* - * Install the new expiry time (or zero). - * For a timer with no notification action, we don't actually - * arm the timer (we'll just fake it for timer_gettime). + * Arm the timer if it is not disabled, the new expiry value has + * not yet expired and the timer requires signal delivery. + * SIGEV_NONE timers are never armed. In case the timer is not + * armed, enforce the reevaluation of the timer base so that the + * process wide cputime counter can be disabled eventually. */ - cpu_timer_setexpires(ctmr, new_expires); - if (new_expires != 0 && val < new_expires) { - arm_timer(timer, p); + if (likely(!sigev_none)) { + if (new_expires && now < new_expires) + arm_timer(timer, p); + else + trigger_base_recalc_expires(timer, p); } unlock_task_sighand(p, &flags); + + posix_timer_set_common(timer, new); + /* - * Install the new reload setting, and - * set up the signal and overrun bookkeeping. + * If the new expiry time was already in the past the timer was not + * queued. Fire it immediately even if the thread never runs to + * accumulate more time on this clock. */ - timer->it_interval = timespec64_to_ktime(new->it_interval); + if (!sigev_none && new_expires && now >= new_expires) + cpu_timer_fire(timer); +out: + rcu_read_unlock(); + return ret; +} + +static void __posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp, u64 now) +{ + bool sigev_none = timer->it_sigev_notify == SIGEV_NONE; + u64 expires, iv = timer->it_interval; /* - * This acts as a modification timestamp for the timer, - * so any automatic reload attempt will punt on seeing - * that we have reset the timer manually. + * Make sure that interval timers are moved forward for the + * following cases: + * - SIGEV_NONE timers which are never armed + * - Timers which expired, but the signal has not yet been + * delivered */ - timer->it_requeue_pending = (timer->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timer->it_overrun_last = 0; - timer->it_overrun = -1; - - if (val >= new_expires) { - if (new_expires != 0) { - /* - * The designated time already passed, so we notify - * immediately, even if the thread never runs to - * accumulate more time on this clock. - */ - cpu_timer_fire(timer); - } + if (iv && ((timer->it_requeue_pending & REQUEUE_PENDING) || sigev_none)) + expires = bump_cpu_timer(timer, now); + else + expires = cpu_timer_getexpires(&timer->it.cpu); + /* + * Expired interval timers cannot have a remaining time <= 0. + * The kernel has to move them forward so that the next + * timer expiry is > @now. + */ + if (now < expires) { + itp->it_value = ns_to_timespec64(expires - now); + } else { /* - * Make sure we don't keep around the process wide cputime - * counter or the tick dependency if they are not necessary. + * A single shot SIGEV_NONE timer must return 0, when it is + * expired! Timers which have a real signal delivery mode + * must return a remaining time greater than 0 because the + * signal has not yet been delivered. */ - sighand = lock_task_sighand(p, &flags); - if (!sighand) - goto out; - - if (!cpu_timer_queued(ctmr)) - trigger_base_recalc_expires(timer, p); - - unlock_task_sighand(p, &flags); + if (!sigev_none) + itp->it_value.tv_nsec = 1; } - out: - rcu_read_unlock(); - if (old) - old->it_interval = ns_to_timespec64(old_incr); - - return ret; } static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct cpu_timer *ctmr = &timer->it.cpu; - u64 now, expires = cpu_timer_getexpires(ctmr); struct task_struct *p; + u64 now; rcu_read_lock(); p = cpu_timer_task_rcu(timer); - if (!p) - goto out; + if (p && cpu_timer_getexpires(&timer->it.cpu)) { + itp->it_interval = ktime_to_timespec64(timer->it_interval); - /* - * Easy part: convert the reload time. - */ - itp->it_interval = ktime_to_timespec64(timer->it_interval); - - if (!expires) - goto out; - - /* - * Sample the clock to take the difference with the expiry time. - */ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) - now = cpu_clock_sample(clkid, p); - else - now = cpu_clock_sample_group(clkid, p, false); + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + now = cpu_clock_sample(clkid, p); + else + now = cpu_clock_sample_group(clkid, p, false); - if (now < expires) { - itp->it_value = ns_to_timespec64(expires - now); - } else { - /* - * The timer should have expired already, but the firing - * hasn't taken place yet. Say it's just about to expire. - */ - itp->it_value.tv_nsec = 1; - itp->it_value.tv_sec = 0; + __posix_cpu_timer_get(timer, itp, now); } -out: rcu_read_unlock(); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b924f0f096fa..4576aaed13b2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -277,10 +277,17 @@ void posixtimer_rearm(struct kernel_siginfo *info) unlock_timer(timr, flags); } -int posix_timer_event(struct k_itimer *timr, int si_private) +int posix_timer_queue_signal(struct k_itimer *timr) { + int ret, si_private = 0; enum pid_type type; - int ret; + + lockdep_assert_held(&timr->it_lock); + + timr->it_active = 0; + if (timr->it_interval) + si_private = ++timr->it_requeue_pending; + /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). @@ -309,19 +316,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { + struct k_itimer *timr = container_of(timer, struct k_itimer, it.real.timer); enum hrtimer_restart ret = HRTIMER_NORESTART; - struct k_itimer *timr; unsigned long flags; - int si_private = 0; - timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); - timr->it_active = 0; - if (timr->it_interval != 0) - si_private = ++timr->it_requeue_pending; - - if (posix_timer_event(timr, si_private)) { + if (posix_timer_queue_signal(timr)) { /* * The signal was not queued due to SIG_IGN. As a * consequence the timer is not going to be rearmed from @@ -338,14 +339,14 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * change to the signal handling code. * * For now let timers with an interval less than a - * jiffie expire every jiffie and recheck for a + * jiffy expire every jiffy and recheck for a * valid signal handler. * * This avoids interrupt starvation in case of a * very small interval, which would expire the * timer immediately again. * - * Moving now ahead of time by one jiffie tricks + * Moving now ahead of time by one jiffy tricks * hrtimer_forward() to expire the timer later, * while it still maintains the overrun accuracy * for the price of a slight inconsistency in the @@ -515,7 +516,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_irq(¤t->sighand->siglock); /* This makes the timer valid in the hash table */ WRITE_ONCE(new_timer->it_signal, current->signal); - list_add(&new_timer->list, ¤t->signal->posix_timers); + hlist_add_head(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); /* * After unlocking sighand::siglock @new_timer is subject to @@ -856,6 +857,23 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer, return lock_timer(timer_id, flags); } +/* + * Set up the new interval and reset the signal delivery data + */ +void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting) +{ + if (new_setting->it_value.tv_sec || new_setting->it_value.tv_nsec) + timer->it_interval = timespec64_to_ktime(new_setting->it_interval); + else + timer->it_interval = 0; + + /* Prevent reloading in case there is a signal pending */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & ~REQUEUE_PENDING; + /* Reset overrun accounting */ + timer->it_overrun_last = 0; + timer->it_overrun = -1LL; +} + /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, @@ -878,15 +896,12 @@ int common_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; timr->it_active = 0; - timr->it_requeue_pending = (timr->it_requeue_pending + 2) & - ~REQUEUE_PENDING; - timr->it_overrun_last = 0; + posix_timer_set_common(timr, new_setting); - /* Switch off the timer when it_value is zero */ + /* Keep timer disarmed when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; - timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); if (flags & TIMER_ABSTIME) expires = timens_ktime_to_host(timr->it_clock, expires); @@ -904,7 +919,7 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; - int error = 0; + int error; if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) @@ -918,6 +933,9 @@ retry: if (!timr) return -EINVAL; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; @@ -1021,7 +1039,7 @@ retry_delete: } spin_lock(¤t->sighand->siglock); - list_del(&timer->list); + hlist_del(&timer->list); spin_unlock(¤t->sighand->siglock); /* * A concurrent lookup could check timer::it_signal lockless. It @@ -1071,7 +1089,7 @@ retry_delete: goto retry_delete; } - list_del(&timer->list); + hlist_del(&timer->list); /* * Setting timer::it_signal to NULL is technically not required @@ -1092,22 +1110,19 @@ retry_delete: */ void exit_itimers(struct task_struct *tsk) { - struct list_head timers; - struct k_itimer *tmr; + struct hlist_head timers; - if (list_empty(&tsk->signal->posix_timers)) + if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ spin_lock_irq(&tsk->sighand->siglock); - list_replace_init(&tsk->signal->posix_timers, &timers); + hlist_move_list(&tsk->signal->posix_timers, &timers); spin_unlock_irq(&tsk->sighand->siglock); /* The timers are not longer accessible via tsk::signal */ - while (!list_empty(&timers)) { - tmr = list_first_entry(&timers, struct k_itimer, list); - itimer_delete(tmr); - } + while (!hlist_empty(&timers)) + itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index f32a2ebba9b8..4784ea65f685 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -36,10 +36,11 @@ extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; -int posix_timer_event(struct k_itimer *timr, int si_private); +int posix_timer_queue_signal(struct k_itimer *timr); void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting); int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting); +void posix_timer_set_common(struct k_itimer *timer, struct itimerspec64 *new_setting); int common_timer_del(struct k_itimer *timer); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5391e4167d60..7e6f409bf311 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2553,6 +2553,7 @@ int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; struct audit_ntp_data ad; + bool offset_set = false; bool clock_set = false; struct timespec64 ts; unsigned long flags; @@ -2575,6 +2576,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (ret) return ret; + offset_set = delta.tv_sec != 0; audit_tk_injoffset(delta); } @@ -2608,7 +2610,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(); + ntp_notify_cmos_timer(offset_set); return ret; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 64b0d8a0aa0f..0fc9d066a7be 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -365,7 +365,7 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, rem = j % HZ; /* - * If the target jiffie is just after a whole second (which can happen + * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. @@ -672,7 +672,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ - base->next_expiry = bucket_expiry; + WRITE_ONCE(base->next_expiry, bucket_expiry); base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); @@ -1561,6 +1561,8 @@ static inline void timer_base_unlock_expiry(struct timer_base *base) * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) + __releases(&base->lock) __releases(&base->expiry_lock) + __acquires(&base->expiry_lock) __acquires(&base->lock) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); @@ -1898,7 +1900,7 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset, * * Store next expiry time in base->next_expiry. */ -static void next_expiry_recalc(struct timer_base *base) +static void timer_recalc_next_expiry(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; @@ -1928,7 +1930,7 @@ static void next_expiry_recalc(struct timer_base *base) * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next - * expiring jiffie. So in case of: + * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 @@ -1964,7 +1966,7 @@ static void next_expiry_recalc(struct timer_base *base) clk += adj; } - base->next_expiry = next; + WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); } @@ -1993,7 +1995,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) return basem; /* - * Round up to the next jiffie. High resolution timers are + * Round up to the next jiffy. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. @@ -2007,7 +2009,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, unsigned long basej) { if (base->next_expiry_recalc) - next_expiry_recalc(base); + timer_recalc_next_expiry(base); /* * Move next_expiry for the empty base into the future to prevent an @@ -2018,7 +2020,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - base->next_expiry = basej + NEXT_TIMER_MAX_DELTA; + WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); return base->next_expiry; } @@ -2252,7 +2254,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, base_global, &tevt); /* - * If the next event is only one jiffie ahead there is no need to call + * If the next event is only one jiffy ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. @@ -2411,7 +2413,7 @@ static inline void __run_timers(struct timer_base *base) * jiffies to avoid endless requeuing to current jiffies. */ base->clk++; - next_expiry_recalc(base); + timer_recalc_next_expiry(base); while (levels--) expire_timers(base, heads + levels); @@ -2440,7 +2442,7 @@ static void run_timer_base(int index) /* * This function runs timers and the timer-tq in bottom half context. */ -static __latent_entropy void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(void) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { @@ -2462,8 +2464,40 @@ static void run_local_timers(void) hrtimer_run_queues(); for (int i = 0; i < NR_BASES; i++, base++) { - /* Raise the softirq only if required. */ - if (time_after_eq(jiffies, base->next_expiry) || + /* + * Raise the softirq only if required. + * + * timer_base::next_expiry can be written by a remote CPU while + * holding the lock. If this write happens at the same time than + * the lockless local read, sanity checker could complain about + * data corruption. + * + * There are two possible situations where + * timer_base::next_expiry is written by a remote CPU: + * + * 1. Remote CPU expires global timers of this CPU and updates + * timer_base::next_expiry of BASE_GLOBAL afterwards in + * next_timer_interrupt() or timer_recalc_next_expiry(). The + * worst outcome is a superfluous raise of the timer softirq + * when the not yet updated value is read. + * + * 2. A new first pinned timer is enqueued by a remote CPU + * and therefore timer_base::next_expiry of BASE_LOCAL is + * updated. When this update is missed, this isn't a + * problem, as an IPI is executed nevertheless when the CPU + * was idle before. When the CPU wasn't idle but the update + * is missed, then the timer would expire one jiffy late - + * bad luck. + * + * Those unlikely corner cases where the worst outcome is only a + * one jiffy delay or a superfluous raise of the softirq are + * not that expensive as doing the check always while holding + * the lock. + * + * Possible remote writers are using WRITE_ONCE(). Local reader + * uses therefore READ_ONCE(). + */ + if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) || (i == BASE_DEF && tmigr_requires_handle_remote())) { raise_softirq(TIMER_SOFTIRQ); return; @@ -2730,7 +2764,7 @@ void __init init_timers(void) */ void msleep(unsigned int msecs) { - unsigned long timeout = msecs_to_jiffies(msecs) + 1; + unsigned long timeout = msecs_to_jiffies(msecs); while (timeout) timeout = schedule_timeout_uninterruptible(timeout); @@ -2744,7 +2778,7 @@ EXPORT_SYMBOL(msleep); */ unsigned long msleep_interruptible(unsigned int msecs) { - unsigned long timeout = msecs_to_jiffies(msecs) + 1; + unsigned long timeout = msecs_to_jiffies(msecs); while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cd098846e251..ac0a01cc8634 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3160,6 +3160,7 @@ struct bpf_uprobe { loff_t offset; unsigned long ref_ctr_offset; u64 cookie; + struct uprobe *uprobe; struct uprobe_consumer consumer; }; @@ -3178,15 +3179,15 @@ struct bpf_uprobe_multi_run_ctx { struct bpf_uprobe *uprobe; }; -static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes, - u32 cnt) +static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt) { u32 i; - for (i = 0; i < cnt; i++) { - uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset, - &uprobes[i].consumer); - } + for (i = 0; i < cnt; i++) + uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer); + + if (cnt) + uprobe_unregister_sync(); } static void bpf_uprobe_multi_link_release(struct bpf_link *link) @@ -3194,7 +3195,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link) struct bpf_uprobe_multi_link *umulti_link; umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); - bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt); + bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt); if (umulti_link->task) put_task_struct(umulti_link->task); path_put(&umulti_link->path); @@ -3322,8 +3323,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe, } static bool -uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx, - struct mm_struct *mm) +uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm) { struct bpf_uprobe *uprobe; @@ -3480,22 +3480,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr &bpf_uprobe_multi_link_lops, prog); for (i = 0; i < cnt; i++) { - err = uprobe_register_refctr(d_real_inode(link->path.dentry), - uprobes[i].offset, - uprobes[i].ref_ctr_offset, - &uprobes[i].consumer); - if (err) { - bpf_uprobe_unregister(&path, uprobes, i); - goto error_free; + uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), + uprobes[i].offset, + uprobes[i].ref_ctr_offset, + &uprobes[i].consumer); + if (IS_ERR(uprobes[i].uprobe)) { + err = PTR_ERR(uprobes[i].uprobe); + link->cnt = i; + goto error_unregister; } } err = bpf_link_prime(&link->link, &link_primer); if (err) - goto error_free; + goto error_unregister; return bpf_link_settle(&link_primer); +error_unregister: + bpf_uprobe_unregister(uprobes, link->cnt); + error_free: kvfree(uprobes); kfree(link); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index d1d5ea2d0a1b..d7d4fb403f6f 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1206,18 +1206,24 @@ static void init_task_vars(int idx) read_unlock(&tasklist_lock); } -static void ftrace_graph_enable_direct(bool enable_branch) +static void ftrace_graph_enable_direct(bool enable_branch, struct fgraph_ops *gops) { trace_func_graph_ent_t func = NULL; trace_func_graph_ret_t retfunc = NULL; int i; - for_each_set_bit(i, &fgraph_array_bitmask, - sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) { - func = fgraph_array[i]->entryfunc; - retfunc = fgraph_array[i]->retfunc; - fgraph_direct_gops = fgraph_array[i]; - } + if (gops) { + func = gops->entryfunc; + retfunc = gops->retfunc; + fgraph_direct_gops = gops; + } else { + for_each_set_bit(i, &fgraph_array_bitmask, + sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) { + func = fgraph_array[i]->entryfunc; + retfunc = fgraph_array[i]->retfunc; + fgraph_direct_gops = fgraph_array[i]; + } + } if (WARN_ON_ONCE(!func)) return; @@ -1256,8 +1262,6 @@ int register_ftrace_graph(struct fgraph_ops *gops) ret = -ENOSPC; goto out; } - - fgraph_array[i] = gops; gops->idx = i; ftrace_graph_active++; @@ -1266,7 +1270,7 @@ int register_ftrace_graph(struct fgraph_ops *gops) ftrace_graph_disable_direct(true); if (ftrace_graph_active == 1) { - ftrace_graph_enable_direct(false); + ftrace_graph_enable_direct(false, gops); register_pm_notifier(&ftrace_suspend_notifier); ret = start_graph_tracing(); if (ret) @@ -1281,14 +1285,15 @@ int register_ftrace_graph(struct fgraph_ops *gops) } else { init_task_vars(gops->idx); } - /* Always save the function, and reset at unregistering */ gops->saved_func = gops->entryfunc; ret = ftrace_startup_subops(&graph_ops, &gops->ops, command); + if (!ret) + fgraph_array[i] = gops; + error: if (ret) { - fgraph_array[i] = &fgraph_stub; ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); @@ -1324,7 +1329,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) ftrace_shutdown_subops(&graph_ops, &gops->ops, command); if (ftrace_graph_active == 1) - ftrace_graph_enable_direct(true); + ftrace_graph_enable_direct(true, NULL); else if (!ftrace_graph_active) ftrace_graph_disable_direct(false); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebe7ce2f5f4a..c3b2c7dfadef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2226,10 +2226,6 @@ static __init int init_trace_selftests(void) } core_initcall(init_trace_selftests); #else -static inline int run_tracer_selftest(struct tracer *type) -{ - return 0; -} static inline int do_run_tracer_selftest(struct tracer *type) { return 0; @@ -3958,6 +3954,8 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) break; entries++; ring_buffer_iter_advance(buf_iter); + /* This could be a big loop */ + cond_resched(); } per_cpu_ptr(iter->array_buffer->data, cpu)->skipped_entries = entries; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 66a871553d4a..1439064f65d6 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -228,6 +228,11 @@ static inline struct osnoise_variables *this_cpu_osn_var(void) return this_cpu_ptr(&per_cpu_osnoise_var); } +/* + * Protect the interface. + */ +static struct mutex interface_lock; + #ifdef CONFIG_TIMERLAT_TRACER /* * Runtime information for the timer mode. @@ -259,14 +264,20 @@ static inline void tlat_var_reset(void) { struct timerlat_variables *tlat_var; int cpu; + + /* Synchronize with the timerlat interfaces */ + mutex_lock(&interface_lock); /* * So far, all the values are initialized as 0, so * zeroing the structure is perfect. */ for_each_cpu(cpu, cpu_online_mask) { tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); } + mutex_unlock(&interface_lock); } #else /* CONFIG_TIMERLAT_TRACER */ #define tlat_var_reset() do {} while (0) @@ -332,11 +343,6 @@ struct timerlat_sample { #endif /* - * Protect the interface. - */ -static struct mutex interface_lock; - -/* * Tracer data. */ static struct osnoise_data { @@ -1535,7 +1541,7 @@ static int run_osnoise(void) * This will eventually cause unwarranted noise as PREEMPT_RCU * will force preemption as the means of ending the current * grace period. We avoid this problem by calling - * rcu_momentary_dyntick_idle(), which performs a zero duration + * rcu_momentary_eqs(), which performs a zero duration * EQS allowing PREEMPT_RCU to end the current grace period. * This call shouldn't be wrapped inside an RCU critical * section. @@ -1547,7 +1553,7 @@ static int run_osnoise(void) if (!disable_irq) local_irq_disable(); - rcu_momentary_dyntick_idle(); + rcu_momentary_eqs(); if (!disable_irq) local_irq_enable(); @@ -1612,6 +1618,7 @@ out: static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; +static struct cpumask kthread_cpumask; /* * osnoise_sleep - sleep until the next period @@ -1675,6 +1682,7 @@ static inline int osnoise_migration_pending(void) */ mutex_lock(&interface_lock); this_cpu_osn_var()->kthread = NULL; + cpumask_clear_cpu(smp_processor_id(), &kthread_cpumask); mutex_unlock(&interface_lock); return 1; @@ -1945,11 +1953,16 @@ static void stop_kthread(unsigned int cpu) { struct task_struct *kthread; + mutex_lock(&interface_lock); kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; if (kthread) { - if (test_bit(OSN_WORKLOAD, &osnoise_options)) { + per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + mutex_unlock(&interface_lock); + + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask) && + !WARN_ON(!test_bit(OSN_WORKLOAD, &osnoise_options))) { kthread_stop(kthread); - } else { + } else if (!WARN_ON(test_bit(OSN_WORKLOAD, &osnoise_options))) { /* * This is a user thread waiting on the timerlat_fd. We need * to close all users, and the best way to guarantee this is @@ -1958,8 +1971,8 @@ static void stop_kthread(unsigned int cpu) kill_pid(kthread->thread_pid, SIGKILL, 1); put_task_struct(kthread); } - per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; } else { + mutex_unlock(&interface_lock); /* if no workload, just return */ if (!test_bit(OSN_WORKLOAD, &osnoise_options)) { /* @@ -1967,7 +1980,6 @@ static void stop_kthread(unsigned int cpu) */ per_cpu(per_cpu_osnoise_var, cpu).sampling = false; barrier(); - return; } } } @@ -1982,12 +1994,8 @@ static void stop_per_cpu_kthreads(void) { int cpu; - cpus_read_lock(); - - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) stop_kthread(cpu); - - cpus_read_unlock(); } /* @@ -2021,6 +2029,7 @@ static int start_kthread(unsigned int cpu) } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; + cpumask_set_cpu(cpu, &kthread_cpumask); return 0; } @@ -2048,8 +2057,16 @@ static int start_per_cpu_kthreads(void) */ cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + if (cpumask_test_and_clear_cpu(cpu, &kthread_cpumask)) { + struct task_struct *kthread; + + kthread = per_cpu(per_cpu_osnoise_var, cpu).kthread; + if (!WARN_ON(!kthread)) + kthread_stop(kthread); + } per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL; + } for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); @@ -2579,7 +2596,8 @@ static int timerlat_fd_release(struct inode *inode, struct file *file) osn_var = per_cpu_ptr(&per_cpu_osnoise_var, cpu); tlat_var = per_cpu_ptr(&per_cpu_timerlat_var, cpu); - hrtimer_cancel(&tlat_var->timer); + if (tlat_var->kthread) + hrtimer_cancel(&tlat_var->timer); memset(tlat_var, 0, sizeof(*tlat_var)); osn_var->sampling = 0; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 130ca7e7787e..ae2ace5e515a 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p) * - wakeup_dl handles tasks belonging to sched_dl class only. */ if (tracing_dl || (wakeup_dl && !dl_task(p)) || - (wakeup_rt && !dl_task(p) && !rt_task(p)) || + (wakeup_rt && !rt_or_dl_task(p)) || (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) return; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 97f1e4bc47dc..c4ad7cd7e778 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -942,7 +942,7 @@ static __init int test_graph_storage_multi(void) { struct fgraph_fixture *fixture; bool printed = false; - int i, ret; + int i, j, ret; pr_cont("PASSED\n"); pr_info("Testing multiple fgraph storage on a function: "); @@ -953,22 +953,35 @@ static __init int test_graph_storage_multi(void) if (ret && ret != -ENODEV) { pr_cont("*Could not set filter* "); printed = true; - goto out; + goto out2; } + } + for (j = 0; j < ARRAY_SIZE(store_bytes); j++) { + fixture = &store_bytes[j]; ret = register_ftrace_graph(&fixture->gops); if (ret) { pr_warn("Failed to init store_bytes fgraph tracing\n"); printed = true; - goto out; + goto out1; } } DYN_FTRACE_TEST_NAME(); -out: +out1: + while (--j >= 0) { + fixture = &store_bytes[j]; + unregister_ftrace_graph(&fixture->gops); + + if (fixture->error_str && !printed) { + pr_cont("*** %s ***", fixture->error_str); + printed = true; + } + } +out2: while (--i >= 0) { fixture = &store_bytes[i]; - unregister_ftrace_graph(&fixture->gops); + ftrace_free_filter(&fixture->gops.ops); if (fixture->error_str && !printed) { pr_cont("*** %s ***", fixture->error_str); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c98e3b3386ba..f7443e996b1b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -58,8 +58,8 @@ struct trace_uprobe { struct dyn_event devent; struct uprobe_consumer consumer; struct path path; - struct inode *inode; char *filename; + struct uprobe *uprobe; unsigned long offset; unsigned long ref_ctr_offset; unsigned long nhit; @@ -1078,43 +1078,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e return trace_handle_return(s); } -typedef bool (*filter_func_t)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); +typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) { - int ret; + struct inode *inode = d_real_inode(tu->path.dentry); + struct uprobe *uprobe; tu->consumer.filter = filter; - tu->inode = d_real_inode(tu->path.dentry); - - if (tu->ref_ctr_offset) - ret = uprobe_register_refctr(tu->inode, tu->offset, - tu->ref_ctr_offset, &tu->consumer); - else - ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); + uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer); + if (IS_ERR(uprobe)) + return PTR_ERR(uprobe); - if (ret) - tu->inode = NULL; - - return ret; + tu->uprobe = uprobe; + return 0; } static void __probe_event_disable(struct trace_probe *tp) { struct trace_uprobe *tu; + bool sync = false; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - if (!tu->inode) + if (!tu->uprobe) continue; - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->inode = NULL; + uprobe_unregister_nosync(tu->uprobe, &tu->consumer); + sync = true; + tu->uprobe = NULL; } + if (sync) + uprobe_unregister_sync(); } static int probe_event_enable(struct trace_event_call *call, @@ -1310,7 +1307,7 @@ static int uprobe_perf_close(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); + ret = uprobe_apply(tu->uprobe, &tu->consumer, false); if (ret) break; } @@ -1334,7 +1331,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + err = uprobe_apply(tu->uprobe, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); break; @@ -1344,8 +1341,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return err; } -static bool uprobe_perf_filter(struct uprobe_consumer *uc, - enum uprobe_filter_ctx ctx, struct mm_struct *mm) +static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { struct trace_uprobe_filter *filter; struct trace_uprobe *tu; @@ -1431,7 +1427,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer **ucbp) { - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + if (!uprobe_perf_filter(&tu->consumer, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) diff --git a/kernel/user.c b/kernel/user.c index aa1162deafe4..f46b1d41163b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -36,33 +36,33 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); */ struct user_namespace init_user_ns = { .uid_map = { - .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, + .nr_extents = 1, }, }, .gid_map = { - .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, + .nr_extents = 1, }, }, .projid_map = { - .nr_extents = 1, { .extent[0] = { .first = 0, .lower_first = 0, .count = 4294967295U, }, + .nr_extents = 1, }, }, .ns.count = REFCOUNT_INIT(3), diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index 8b4f8cc2e0ec..1fec61603ef3 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -198,17 +198,17 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_private); VMCOREINFO_NUMBER(PG_swapcache); VMCOREINFO_NUMBER(PG_swapbacked); -#define PAGE_SLAB_MAPCOUNT_VALUE (~PG_slab) +#define PAGE_SLAB_MAPCOUNT_VALUE (PGTY_slab << 24) VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE); #ifdef CONFIG_MEMORY_FAILURE VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); -#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) +#define PAGE_BUDDY_MAPCOUNT_VALUE (PGTY_buddy << 24) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb) +#define PAGE_HUGETLB_MAPCOUNT_VALUE (PGTY_hugetlb << 24) VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); -#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) +#define PAGE_OFFLINE_MAPCOUNT_VALUE (PGTY_offline << 24) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #ifdef CONFIG_KALLSYMS diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e7b005ff3750..9949ffad8df0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -364,7 +364,8 @@ struct workqueue_struct { #ifdef CONFIG_LOCKDEP char *lock_name; struct lock_class_key key; - struct lockdep_map lockdep_map; + struct lockdep_map __lockdep_map; + struct lockdep_map *lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ @@ -476,16 +477,13 @@ static bool wq_debug_force_rr_cpu = false; module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* to raise softirq for the BH worker pools on other CPUs */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], - bh_pool_irq_works); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works); /* the BH worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - bh_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -2709,7 +2707,6 @@ static void detach_worker(struct worker *worker) unbind_worker(worker); list_del(&worker->node); - worker->pool = NULL; } /** @@ -2729,6 +2726,7 @@ static void worker_detach_from_pool(struct worker *worker) mutex_lock(&wq_pool_attach_mutex); detach_worker(worker); + worker->pool = NULL; mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ @@ -3203,7 +3201,7 @@ __acquires(&pool->lock) lockdep_start_depth = lockdep_depth(current); /* see drain_dead_softirq_workfn() */ if (!bh_draining) - lock_map_acquire(&pwq->wq->lockdep_map); + lock_map_acquire(pwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); /* * Strictly speaking we should mark the invariant state without holding @@ -3237,7 +3235,7 @@ __acquires(&pool->lock) pwq->stats[PWQ_STAT_COMPLETED]++; lock_map_release(&lockdep_map); if (!bh_draining) - lock_map_release(&pwq->wq->lockdep_map); + lock_map_release(pwq->wq->lockdep_map); if (unlikely((worker->task && in_atomic()) || lockdep_depth(current) != lockdep_start_depth || @@ -3349,7 +3347,11 @@ woke_up: if (unlikely(worker->flags & WORKER_DIE)) { raw_spin_unlock_irq(&pool->lock); set_pf_worker(false); - + /* + * The worker is dead and PF_WQ_WORKER is cleared, worker->pool + * shouldn't be accessed, reset it to NULL in case otherwise. + */ + worker->pool = NULL; ida_free(&pool->worker_ida, worker->id); return 0; } @@ -3869,11 +3871,14 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, static void touch_wq_lockdep_map(struct workqueue_struct *wq) { #ifdef CONFIG_LOCKDEP + if (unlikely(!wq->lockdep_map)) + return; + if (wq->flags & WQ_BH) local_bh_disable(); - lock_map_acquire(&wq->lockdep_map); - lock_map_release(&wq->lockdep_map); + lock_map_acquire(wq->lockdep_map); + lock_map_release(wq->lockdep_map); if (wq->flags & WQ_BH) local_bh_enable(); @@ -3907,7 +3912,7 @@ void __flush_workqueue(struct workqueue_struct *wq) struct wq_flusher this_flusher = { .list = LIST_HEAD_INIT(this_flusher.list), .flush_color = -1, - .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map), + .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)), }; int next_color; @@ -4772,16 +4777,23 @@ static void wq_init_lockdep(struct workqueue_struct *wq) lock_name = wq->name; wq->lock_name = lock_name; - lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); + wq->lockdep_map = &wq->__lockdep_map; + lockdep_init_map(wq->lockdep_map, lock_name, &wq->key, 0); } static void wq_unregister_lockdep(struct workqueue_struct *wq) { + if (wq->lockdep_map != &wq->__lockdep_map) + return; + lockdep_unregister_key(&wq->key); } static void wq_free_lockdep(struct workqueue_struct *wq) { + if (wq->lockdep_map != &wq->__lockdep_map) + return; + if (wq->lock_name != wq->name) kfree(wq->lock_name); } @@ -5615,12 +5627,10 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } while (activated); } -__printf(1, 4) -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...) +static struct workqueue_struct *__alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, va_list args) { - va_list args; struct workqueue_struct *wq; size_t wq_size; int name_len; @@ -5652,9 +5662,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, goto err_free_wq; } - va_start(args, max_active); name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args); - va_end(args); if (name_len >= WQ_NAME_LEN) pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n", @@ -5684,12 +5692,11 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); - wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); if (flags & WQ_UNBOUND) { if (alloc_node_nr_active(wq->node_nr_active) < 0) - goto err_unreg_lockdep; + goto err_free_wq; } /* @@ -5728,9 +5735,6 @@ err_unlock_free_node_nr_active: kthread_flush_worker(pwq_release_worker); free_node_nr_active(wq->node_nr_active); } -err_unreg_lockdep: - wq_unregister_lockdep(wq); - wq_free_lockdep(wq); err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); @@ -5741,8 +5745,49 @@ err_destroy: destroy_workqueue(wq); return NULL; } + +__printf(1, 4) +struct workqueue_struct *alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, max_active); + wq = __alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + wq_init_lockdep(wq); + + return wq; +} EXPORT_SYMBOL_GPL(alloc_workqueue); +#ifdef CONFIG_LOCKDEP +__printf(1, 5) +struct workqueue_struct * +alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, + int max_active, struct lockdep_map *lockdep_map, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, lockdep_map); + wq = __alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + wq->lockdep_map = lockdep_map; + + return wq; +} +EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map); +#endif + static bool pwq_busy(struct pool_workqueue *pwq) { int i; @@ -7402,6 +7447,9 @@ static struct timer_list wq_watchdog_timer; static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; +static unsigned int wq_panic_on_stall; +module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644); + /* * Show workers that might prevent the processing of pending work items. * The only candidates are CPU-bound workers in the running state. @@ -7453,6 +7501,16 @@ static void show_cpu_pools_hogs(void) rcu_read_unlock(); } +static void panic_on_wq_watchdog(void) +{ + static unsigned int wq_stall; + + if (wq_panic_on_stall) { + wq_stall++; + BUG_ON(wq_stall >= wq_panic_on_stall); + } +} + static void wq_watchdog_reset_touched(void) { int cpu; @@ -7525,6 +7583,9 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (cpu_pool_stall) show_cpu_pools_hogs(); + if (lockup_detected) + panic_on_wq_watchdog(); + wq_watchdog_reset_touched(); mod_timer(&wq_watchdog_timer, jiffies + thresh); } |