summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec1
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_tree.c63
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/bpf/bpf_lru_list.c9
-rw-r--r--kernel/bpf/bpf_lru_list.h1
-rw-r--r--kernel/bpf/btf.c2
-rw-r--r--kernel/bpf/cgroup.c2
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/helpers.c14
-rw-r--r--kernel/bpf/sysfs_btf.c4
-rw-r--r--kernel/bpf/verifier.c10
-rw-r--r--kernel/cgroup/legacy_freezer.c11
-rw-r--r--kernel/configs/hardening.config6
-rw-r--r--kernel/cpu.c130
-rw-r--r--kernel/dma/contiguous.c5
-rw-r--r--kernel/entry/Makefile3
-rw-r--r--kernel/entry/common.c113
-rw-r--r--kernel/entry/syscall-common.c112
-rw-r--r--kernel/entry/syscall_user_dispatch.c36
-rw-r--r--kernel/events/core.c142
-rw-r--r--kernel/events/ring_buffer.c4
-rw-r--r--kernel/exit.c17
-rw-r--r--kernel/fork.c45
-rw-r--r--kernel/freezer.c15
-rw-r--r--kernel/futex/core.c293
-rw-r--r--kernel/futex/futex.h2
-rw-r--r--kernel/irq/Kconfig11
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/affinity.c11
-rw-r--r--kernel/irq/chip.c78
-rw-r--r--kernel/irq/cpuhotplug.c7
-rw-r--r--kernel/irq/internals.h6
-rw-r--r--kernel/irq/irq_sim.c2
-rw-r--r--kernel/irq/irq_test.c229
-rw-r--r--kernel/irq/irqdomain.c1
-rw-r--r--kernel/irq/msi.c3
-rw-r--r--kernel/irq/pm.c16
-rw-r--r--kernel/irq/spurious.c37
-rw-r--r--kernel/kexec_core.c3
-rw-r--r--kernel/kexec_file.c10
-rw-r--r--kernel/kexec_handover.c33
-rw-r--r--kernel/kstack_erase.c (renamed from kernel/stackleak.c)22
-rw-r--r--kernel/locking/lockdep.c39
-rw-r--r--kernel/locking/lockdep_internals.h18
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/locking/mutex-debug.c9
-rw-r--r--kernel/locking/mutex.c22
-rw-r--r--kernel/locking/mutex.h3
-rw-r--r--kernel/locking/rtmutex_api.c18
-rw-r--r--kernel/locking/rwsem.c4
-rw-r--r--kernel/locking/ww_mutex.h16
-rw-r--r--kernel/module/internal.h3
-rw-r--r--kernel/module/main.c53
-rw-r--r--kernel/module/sysfs.c14
-rw-r--r--kernel/panic.c60
-rw-r--r--kernel/pid.c33
-rw-r--r--kernel/power/console.c7
-rw-r--r--kernel/power/hibernate.c3
-rw-r--r--kernel/power/main.c9
-rw-r--r--kernel/power/power.h5
-rw-r--r--kernel/power/snapshot.c12
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/rcu/tree.c4
-rw-r--r--kernel/rcu/tree_stall.h33
-rw-r--r--kernel/resource.c5
-rw-r--r--kernel/sched/autogroup.c9
-rw-r--r--kernel/sched/autogroup.h6
-rw-r--r--kernel/sched/build_policy.c6
-rw-r--r--kernel/sched/build_utility.c9
-rw-r--r--kernel/sched/clock.c7
-rw-r--r--kernel/sched/completion.c5
-rw-r--r--kernel/sched/core.c893
-rw-r--r--kernel/sched/core_sched.c2
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--kernel/sched/cpudeadline.c1
-rw-r--r--kernel/sched/cpudeadline.h4
-rw-r--r--kernel/sched/cpufreq.c1
-rw-r--r--kernel/sched/cpufreq_schedutil.c6
-rw-r--r--kernel/sched/cpupri.c1
-rw-r--r--kernel/sched/cpupri.h5
-rw-r--r--kernel/sched/cputime.c17
-rw-r--r--kernel/sched/deadline.c218
-rw-r--r--kernel/sched/debug.c51
-rw-r--r--kernel/sched/ext.c29
-rw-r--r--kernel/sched/ext.h2
-rw-r--r--kernel/sched/ext_idle.c2
-rw-r--r--kernel/sched/fair.c408
-rw-r--r--kernel/sched/idle.c15
-rw-r--r--kernel/sched/isolation.c2
-rw-r--r--kernel/sched/loadavg.c8
-rw-r--r--kernel/sched/membarrier.c2
-rw-r--r--kernel/sched/pelt.c5
-rw-r--r--kernel/sched/pelt.h67
-rw-r--r--kernel/sched/psi.c129
-rw-r--r--kernel/sched/rt.c112
-rw-r--r--kernel/sched/sched-pelt.h1
-rw-r--r--kernel/sched/sched.h245
-rw-r--r--kernel/sched/smp.h7
-rw-r--r--kernel/sched/stats.c5
-rw-r--r--kernel/sched/stats.h10
-rw-r--r--kernel/sched/stop_task.c5
-rw-r--r--kernel/sched/swait.c1
-rw-r--r--kernel/sched/syscalls.c15
-rw-r--r--kernel/sched/topology.c57
-rw-r--r--kernel/sched/wait.c1
-rw-r--r--kernel/sched/wait_bit.c3
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/smp.c70
-rw-r--r--kernel/smpboot.c4
-rw-r--r--kernel/stop_machine.c20
-rw-r--r--kernel/sys.c29
-rw-r--r--kernel/sysctl.c270
-rw-r--r--kernel/time/Kconfig15
-rw-r--r--kernel/time/clocksource.c8
-rw-r--r--kernel/time/namespace.c5
-rw-r--r--kernel/time/ntp.c72
-rw-r--r--kernel/time/ntp_internal.h13
-rw-r--r--kernel/time/posix-cpu-timers.c9
-rw-r--r--kernel/time/posix-timers.c3
-rw-r--r--kernel/time/posix-timers.h1
-rw-r--r--kernel/time/timecounter.c2
-rw-r--r--kernel/time/timekeeping.c631
-rw-r--r--kernel/time/timekeeping_internal.h3
-rw-r--r--kernel/time/timer_migration.c23
-rw-r--r--kernel/time/vsyscall.c70
-rw-r--r--kernel/trace/blktrace.c25
-rw-r--r--kernel/trace/trace_events.c5
-rw-r--r--kernel/trace/trace_events_filter.c16
-rw-r--r--kernel/trace/trace_functions_graph.c6
-rw-r--r--kernel/trace/trace_osnoise.c2
-rw-r--r--kernel/trace/trace_probe.c2
-rw-r--r--kernel/watchdog_perf.c22
-rw-r--r--kernel/workqueue.c3
135 files changed, 3322 insertions, 2288 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index e64ce21f9a80..2ee603a98813 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -134,6 +134,7 @@ config CRASH_DM_CRYPT
depends on KEXEC_FILE
depends on CRASH_DUMP
depends on DM_CRYPT
+ depends on KEYS
help
With this option enabled, user space can intereact with
/sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys
diff --git a/kernel/Makefile b/kernel/Makefile
index 32e80dd626af..0ee9afd8b7cf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -139,11 +139,12 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
-CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
-obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
-KASAN_SANITIZE_stackleak.o := n
-KCSAN_SANITIZE_stackleak.o := n
-KCOV_INSTRUMENT_stackleak.o := n
+CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE)
+CFLAGS_kstack_erase.o += $(call cc-option,-mgeneral-regs-only)
+obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o
+KASAN_SANITIZE_kstack_erase.o := n
+KCSAN_SANITIZE_kstack_erase.o := n
+KCOV_INSTRUMENT_kstack_erase.o := n
obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
diff --git a/kernel/audit.h b/kernel/audit.h
index 0211cb307d30..2a24d01c5fb0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -200,7 +200,7 @@ struct audit_context {
int argc;
} execve;
struct {
- char *name;
+ const char *name;
} module;
struct {
struct audit_ntp_data ntp_data;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f2f38903b2fe..b0eae2a3c895 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule)
return 0;
}
-static int compare_root(struct vfsmount *mnt, void *arg)
-{
- return inode_to_key(d_backing_inode(mnt->mnt_root)) ==
- (unsigned long)arg;
-}
-
void audit_trim_trees(void)
{
struct list_head cursor;
@@ -683,8 +677,9 @@ void audit_trim_trees(void)
while (cursor.next != &tree_list) {
struct audit_tree *tree;
struct path path;
- struct vfsmount *root_mnt;
struct audit_node *node;
+ struct path *paths;
+ struct path array[16];
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@@ -696,9 +691,9 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(root_mnt))
+ if (IS_ERR(paths))
goto skip_it;
spin_lock(&hash_lock);
@@ -706,14 +701,17 @@ void audit_trim_trees(void)
struct audit_chunk *chunk = find_chunk(node);
/* this could be NULL if the watch is dying else where... */
node->index |= 1U<<31;
- if (iterate_mounts(compare_root,
- (void *)(chunk->key),
- root_mnt))
- node->index &= ~(1U<<31);
+ for (struct path *p = paths; p->dentry; p++) {
+ struct inode *inode = p->dentry->d_inode;
+ if (inode_to_key(inode) == chunk->key) {
+ node->index &= ~(1U<<31);
+ break;
+ }
+ }
}
spin_unlock(&hash_lock);
trim_marked(tree);
- drop_collected_mounts(root_mnt);
+ drop_collected_paths(paths, array);
skip_it:
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree)
put_tree(tree);
}
-static int tag_mount(struct vfsmount *mnt, void *arg)
+static int tag_mounts(struct path *paths, struct audit_tree *tree)
{
- return tag_chunk(d_backing_inode(mnt->mnt_root), arg);
+ for (struct path *p = paths; p->dentry; p++) {
+ int err = tag_chunk(p->dentry->d_inode, tree);
+ if (err)
+ return err;
+ }
+ return 0;
}
/*
@@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
{
struct audit_tree *seed = rule->tree, *tree;
struct path path;
- struct vfsmount *mnt;
+ struct path array[16];
+ struct path *paths;
int err;
rule->tree = NULL;
@@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = kern_path(tree->pathname, 0, &path);
if (err)
goto Err;
- mnt = collect_mounts(&path);
+ paths = collect_paths(&path, array, 16);
path_put(&path);
- if (IS_ERR(mnt)) {
- err = PTR_ERR(mnt);
+ if (IS_ERR(paths)) {
+ err = PTR_ERR(paths);
goto Err;
}
get_tree(tree);
- err = iterate_mounts(tag_mount, tree, mnt);
- drop_collected_mounts(mnt);
+ err = tag_mounts(paths, tree);
+ drop_collected_paths(paths, array);
if (!err) {
struct audit_node *node;
@@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new)
struct list_head cursor, barrier;
int failed = 0;
struct path path1, path2;
- struct vfsmount *tagged;
+ struct path array[16];
+ struct path *paths;
int err;
err = kern_path(new, 0, &path2);
if (err)
return err;
- tagged = collect_mounts(&path2);
+ paths = collect_paths(&path2, array, 16);
path_put(&path2);
- if (IS_ERR(tagged))
- return PTR_ERR(tagged);
+ if (IS_ERR(paths))
+ return PTR_ERR(paths);
err = kern_path(old, 0, &path1);
if (err) {
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return err;
}
@@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new)
continue;
}
- failed = iterate_mounts(tag_mount, tree, tagged);
+ failed = tag_mounts(paths, tree);
if (failed) {
put_tree(tree);
mutex_lock(&audit_filter_mutex);
@@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new)
list_del(&cursor);
mutex_unlock(&audit_filter_mutex);
path_put(&path1);
- drop_collected_mounts(tagged);
+ drop_collected_paths(paths, array);
return failed;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 78fd876a5473..eb98cd6fe91f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2864,7 +2864,7 @@ void __audit_openat2_how(struct open_how *how)
context->type = AUDIT_OPENAT2;
}
-void __audit_log_kern_module(char *name)
+void __audit_log_kern_module(const char *name)
{
struct audit_context *context = audit_context();
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 3dabdd137d10..2d6e1c98d8ad 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
list) {
__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
- if (++nfree == LOCAL_FREE_TARGET)
+ if (++nfree == lru->target_free)
break;
}
- if (nfree < LOCAL_FREE_TARGET)
- __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+ if (nfree < lru->target_free)
+ __bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
local_free_list(loc_l),
BPF_LRU_LOCAL_LIST_T_FREE);
@@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
buf += elem_size;
}
+
+ lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
+ 1, LOCAL_FREE_TARGET);
}
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index cbd8d3720c2b..fe2661a58ea9 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -58,6 +58,7 @@ struct bpf_lru {
del_from_htab_func del_from_htab;
void *del_arg;
unsigned int hash_offset;
+ unsigned int target_free;
unsigned int nr_scans;
bool percpu;
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1d2cf898e21e..e8e63bd025c7 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -8183,7 +8183,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
attr->attr.mode = 0444;
attr->size = btf->data_size;
attr->private = btf->data;
- attr->read_new = sysfs_bin_attr_simple_read;
+ attr->read = sysfs_bin_attr_simple_read;
err = sysfs_create_bin_file(btf_kobj, attr);
if (err) {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9122c39870bf..f4885514f007 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -2134,7 +2134,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c20babbf998f..dae281a1286d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -304,7 +304,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
if (!raw)
return -ENOMEM;
- sha1_init(digest);
+ sha1_init_raw(digest);
memset(ws, 0, sizeof(ws));
/* We need to take out the map fd for the digest calculation
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b71e428ad936..88035dae38c1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -884,6 +884,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
if (fmt[i] == 'p') {
sizeof_cur_arg = sizeof(long);
+ if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+ ispunct(fmt[i + 1])) {
+ if (tmp_buf)
+ cur_arg = raw_args[num_spec];
+ goto nocopy_fmt;
+ }
+
if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
fmt[i + 2] == 's') {
fmt_ptype = fmt[i + 1];
@@ -891,11 +898,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
goto fmt_str;
}
- if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
- ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+ if (fmt[i + 1] == 'K' ||
fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
fmt[i + 1] == 'S') {
- /* just kernel pointers */
if (tmp_buf)
cur_arg = raw_args[num_spec];
i++;
@@ -3397,6 +3402,9 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPAB
BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
#endif
BTF_ID_FLAGS(func, __bpf_trap)
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU)
+#endif
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 941d0d2427e3..9cbe15ce3540 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -21,7 +21,7 @@ static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
{
unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
size_t vm_size = vma->vm_end - vma->vm_start;
- phys_addr_t addr = virt_to_phys(__start_BTF);
+ phys_addr_t addr = __pa_symbol(__start_BTF);
unsigned long pfn = addr >> PAGE_SHIFT;
if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
@@ -45,7 +45,7 @@ static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
- .read_new = sysfs_bin_attr_simple_read,
+ .read = sysfs_bin_attr_simple_read,
.mmap = btf_sysfs_vmlinux_mmap,
};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7d6e0c5928b..a0d663be91a2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6993,6 +6993,10 @@ BTF_TYPE_SAFE_RCU(struct css_set) {
struct cgroup *dfl_cgrp;
};
+BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) {
+ struct cgroup *cgroup;
+};
+
/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
struct file __rcu *exe_file;
@@ -7027,8 +7031,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) {
struct inode *f_inode;
};
-BTF_TYPE_SAFE_TRUSTED(struct dentry) {
- /* no negative dentry-s in places where bpf can see it */
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) {
struct inode *d_inode;
};
@@ -7043,6 +7046,7 @@ static bool type_is_rcu(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
}
@@ -7066,7 +7070,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
- BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
@@ -7076,6 +7079,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
const char *field_name, u32 btf_id)
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
"__safe_trusted_or_null");
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 039d1eb2f215..dd9417425d92 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer)
bool cgroup_freezing(struct task_struct *task)
{
bool ret;
- unsigned int state;
rcu_read_lock();
- /* Check if the cgroup is still FREEZING, but not FROZEN. The extra
- * !FROZEN check is required, because the FREEZING bit is not cleared
- * when the state FROZEN is reached.
- */
- state = task_freezer(task)->state;
- ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN);
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
rcu_read_unlock();
return ret;
@@ -188,13 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset)
if (!(freezer->state & CGROUP_FREEZING)) {
__thaw_task(task);
} else {
- freeze_task(task);
-
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
freezer = parent_freezer(freezer);
}
+ freeze_task(task);
}
}
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index dd7c32fb5ac1..64caaf997fc0 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -60,9 +60,15 @@ CONFIG_LIST_HARDENED=y
# Initialize all heap variables to zero on allocation.
CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y
+# Initialize all heap variables to zero on free to reduce stale data lifetime.
+CONFIG_INIT_ON_FREE_DEFAULT_ON=y
+
# Initialize all stack variables to zero on function entry.
CONFIG_INIT_STACK_ALL_ZERO=y
+# Wipe kernel stack after syscall completion to reduce stale data lifetime.
+CONFIG_KSTACK_ERASE=y
+
# Wipe RAM at reboot via EFI. For more details, see:
# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/
# https://bugzilla.redhat.com/show_bug.cgi?id=1532058
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a59e009e0be4..faf0f23fc5d8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -37,6 +37,7 @@
#include <linux/cpuset.h>
#include <linux/random.h>
#include <linux/cc_platform.h>
+#include <linux/parser.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -3174,8 +3175,38 @@ void __init boot_cpu_hotplug_init(void)
#ifdef CONFIG_CPU_MITIGATIONS
/*
- * These are used for a global "mitigations=" cmdline option for toggling
- * optional CPU mitigations.
+ * All except the cross-thread attack vector are mitigated by default.
+ * Cross-thread mitigation often requires disabling SMT which is expensive
+ * so cross-thread mitigations are only partially enabled by default.
+ *
+ * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is
+ * present.
+ */
+static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = {
+ [CPU_MITIGATE_USER_KERNEL] = true,
+ [CPU_MITIGATE_USER_USER] = true,
+ [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM),
+ [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM),
+};
+
+bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v)
+{
+ if (v < NR_CPU_ATTACK_VECTORS)
+ return attack_vectors[v];
+
+ WARN_ONCE(1, "Invalid attack vector %d\n", v);
+ return false;
+}
+
+/*
+ * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally
+ * be combined with attack-vector disables which follow them.
+ *
+ * Examples:
+ * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread
+ * mitigations=auto,nosmt,no_guest_host,no_guest_guest
+ *
+ * mitigations=off is equivalent to disabling all attack vectors.
*/
enum cpu_mitigations {
CPU_MITIGATIONS_OFF,
@@ -3183,19 +3214,96 @@ enum cpu_mitigations {
CPU_MITIGATIONS_AUTO_NOSMT,
};
+enum {
+ NO_USER_KERNEL,
+ NO_USER_USER,
+ NO_GUEST_HOST,
+ NO_GUEST_GUEST,
+ NO_CROSS_THREAD,
+ NR_VECTOR_PARAMS,
+};
+
+enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO;
static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
+static const match_table_t global_mitigations = {
+ { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"},
+ { CPU_MITIGATIONS_AUTO, "auto"},
+ { CPU_MITIGATIONS_OFF, "off"},
+};
+
+static const match_table_t vector_mitigations = {
+ { NO_USER_KERNEL, "no_user_kernel"},
+ { NO_USER_USER, "no_user_user"},
+ { NO_GUEST_HOST, "no_guest_host"},
+ { NO_GUEST_GUEST, "no_guest_guest"},
+ { NO_CROSS_THREAD, "no_cross_thread"},
+ { NR_VECTOR_PARAMS, NULL},
+};
+
+static int __init mitigations_parse_global_opt(char *arg)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) {
+ const char *pattern = global_mitigations[i].pattern;
+
+ if (!strncmp(arg, pattern, strlen(pattern))) {
+ cpu_mitigations = global_mitigations[i].token;
+ return strlen(pattern);
+ }
+ }
+
+ return 0;
+}
+
static int __init mitigations_parse_cmdline(char *arg)
{
- if (!strcmp(arg, "off"))
- cpu_mitigations = CPU_MITIGATIONS_OFF;
- else if (!strcmp(arg, "auto"))
- cpu_mitigations = CPU_MITIGATIONS_AUTO;
- else if (!strcmp(arg, "auto,nosmt"))
- cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
- else
- pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
- arg);
+ char *s, *p;
+ int len;
+
+ len = mitigations_parse_global_opt(arg);
+
+ if (cpu_mitigations_off()) {
+ memset(attack_vectors, 0, sizeof(attack_vectors));
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ } else if (cpu_mitigations_auto_nosmt()) {
+ smt_mitigations = SMT_MITIGATIONS_ON;
+ }
+
+ p = arg + len;
+
+ if (!*p)
+ return 0;
+
+ /* Attack vector controls may come after the ',' */
+ if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) {
+ pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg);
+ return 0;
+ }
+
+ while ((s = strsep(&p, ",")) != NULL) {
+ switch (match_token(s, vector_mitigations, NULL)) {
+ case NO_USER_KERNEL:
+ attack_vectors[CPU_MITIGATE_USER_KERNEL] = false;
+ break;
+ case NO_USER_USER:
+ attack_vectors[CPU_MITIGATE_USER_USER] = false;
+ break;
+ case NO_GUEST_HOST:
+ attack_vectors[CPU_MITIGATE_GUEST_HOST] = false;
+ break;
+ case NO_GUEST_GUEST:
+ attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false;
+ break;
+ case NO_CROSS_THREAD:
+ smt_mitigations = SMT_MITIGATIONS_OFF;
+ break;
+ default:
+ pr_crit("Unsupported mitigations options %s\n", s);
+ return 0;
+ }
+ }
return 0;
}
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 8df0dfaaca18..67af8a55185d 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -222,7 +222,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
if (size_cmdline != -1) {
selected_size = size_cmdline;
selected_base = base_cmdline;
- selected_limit = min_not_zero(limit_cmdline, limit);
+
+ /* Hornor the user setup dma address limit */
+ selected_limit = limit_cmdline ?: limit;
+
if (base_cmdline + size_cmdline == limit_cmdline)
fixed = true;
} else {
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index d4b8bd0af79b..77fcd83dd663 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -12,5 +12,6 @@ ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
CFLAGS_common.o += -fno-stack-protector
-obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o
+obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o
+obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o
obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index a8dd1f27417c..b82032777310 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -1,84 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/context_tracking.h>
-#include <linux/entry-common.h>
+#include <linux/irq-entry-common.h>
#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/kmsan.h>
#include <linux/livepatch.h>
-#include <linux/audit.h>
#include <linux/tick.h>
-#include "common.h"
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
-static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
-{
- if (unlikely(audit_context())) {
- unsigned long args[6];
-
- syscall_get_arguments(current, regs, args);
- audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
- }
-}
-
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work)
-{
- long ret = 0;
-
- /*
- * Handle Syscall User Dispatch. This must comes first, since
- * the ABI here can be something that doesn't make sense for
- * other syscall_work features.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (syscall_user_dispatch(regs))
- return -1L;
- }
-
- /* Handle ptrace */
- if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = ptrace_report_syscall_entry(regs);
- if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
- return -1L;
- }
-
- /* Do seccomp after ptrace, to catch any tracer changes. */
- if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing();
- if (ret == -1L)
- return ret;
- }
-
- /* Either of the above might have changed the syscall number */
- syscall = syscall_get_nr(current, regs);
-
- if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
- trace_sys_enter(regs, syscall);
- /*
- * Probes or BPF hooks in the tracepoint may have changed the
- * system call number as well.
- */
- syscall = syscall_get_nr(current, regs);
- }
-
- syscall_enter_audit(regs, syscall);
-
- return ret ? : syscall;
-}
-
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
- enter_from_user_mode(regs);
- instrumentation_begin();
- local_irq_enable();
- instrumentation_end();
-}
-
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
@@ -133,46 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
return ti_work;
}
-/*
- * If SYSCALL_EMU is set, then the only reason to report is when
- * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_user_mode().
- */
-static inline bool report_single_step(unsigned long work)
-{
- if (work & SYSCALL_WORK_SYSCALL_EMU)
- return false;
-
- return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
-}
-
-void syscall_exit_work(struct pt_regs *regs, unsigned long work)
-{
- bool step;
-
- /*
- * If the syscall was rolled back due to syscall user dispatching,
- * then the tracers below are not invoked for the same reason as
- * the entry side was not invoked in syscall_trace_enter(): The ABI
- * of these syscalls is unknown.
- */
- if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
- if (unlikely(current->syscall_dispatch.on_dispatch)) {
- current->syscall_dispatch.on_dispatch = false;
- return;
- }
- }
-
- audit_syscall_exit(regs);
-
- if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, syscall_get_return_value(current, regs));
-
- step = report_single_step(work);
- if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- ptrace_report_syscall_exit(regs, step);
-}
-
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
enter_from_user_mode(regs);
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
new file mode 100644
index 000000000000..66e6ba7fa80c
--- /dev/null
+++ b/kernel/entry/syscall-common.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/audit.h>
+#include <linux/entry-common.h>
+#include "common.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+long syscall_trace_enter(struct pt_regs *regs, long syscall,
+ unsigned long work)
+{
+ long ret = 0;
+
+ /*
+ * Handle Syscall User Dispatch. This must comes first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
+ /* Handle ptrace */
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+ ret = ptrace_report_syscall_entry(regs);
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (work & SYSCALL_WORK_SECCOMP) {
+ ret = __secure_computing();
+ if (ret == -1L)
+ return ret;
+ }
+
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) {
+ trace_sys_enter(regs, syscall);
+ /*
+ * Probes or BPF hooks in the tracepoint may have changed the
+ * system call number as well.
+ */
+ syscall = syscall_get_nr(current, regs);
+ }
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+ local_irq_enable();
+ instrumentation_end();
+}
+
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static inline bool report_single_step(unsigned long work)
+{
+ if (work & SYSCALL_WORK_SYSCALL_EMU)
+ return false;
+
+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+ bool step;
+
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
+ audit_syscall_exit(regs);
+
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+ ptrace_report_syscall_exit(regs, step);
+}
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 5340c5aa89e7..a9055eccb27e 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
if (offset || len || selector)
return -EINVAL;
break;
- case PR_SYS_DISPATCH_ON:
+ case PR_SYS_DISPATCH_EXCLUSIVE_ON:
/*
* Validate the direct dispatcher region just for basic
* sanity against overflow and a 0-sized dispatcher
@@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
*/
if (offset && offset + len <= offset)
return -EINVAL;
-
+ break;
+ case PR_SYS_DISPATCH_INCLUSIVE_ON:
+ if (len == 0 || offset + len <= offset)
+ return -EINVAL;
/*
- * access_ok() will clear memory tags for tagged addresses
- * if current has memory tagging enabled.
-
- * To enable a tracer to set a tracees selector the
- * selector address must be untagged for access_ok(),
- * otherwise an untagged tracer will always fail to set a
- * tagged tracees selector.
+ * Invert the range, the check in syscall_user_dispatch()
+ * supports wrap-around.
*/
- if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
- return -EFAULT;
-
+ offset = offset + len;
+ len = -len;
break;
default:
return -EINVAL;
}
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+ *
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (mode != PR_SYS_DISPATCH_OFF && selector &&
+ !access_ok(untagged_addr(selector), sizeof(*selector)))
+ return -EFAULT;
+
task->syscall_dispatch.selector = selector;
task->syscall_dispatch.offset = offset;
task->syscall_dispatch.len = len;
task->syscall_dispatch.on_dispatch = false;
- if (mode == PR_SYS_DISPATCH_ON)
+ if (mode != PR_SYS_DISPATCH_OFF)
set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
else
clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f34c99f8ce8f..22fdf0c187cd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
__perf_ctx_unlock(&cpuctx->ctx);
}
+typedef struct {
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+} class_perf_ctx_lock_t;
+
+static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T)
+{ perf_ctx_unlock(_T->cpuctx, _T->ctx); }
+
+static inline class_perf_ctx_lock_t
+class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; }
+
#define TASK_TOMBSTONE ((void *)-1L)
static bool is_kernel_event(struct perf_event *event)
@@ -938,13 +951,19 @@ static void perf_cgroup_switch(struct task_struct *task)
if (READ_ONCE(cpuctx->cgrp) == NULL)
return;
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
cgrp = perf_cgroup_from_task(task, NULL);
if (READ_ONCE(cpuctx->cgrp) == cgrp)
return;
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx);
+ /*
+ * Re-check, could've raced vs perf_remove_from_context().
+ */
+ if (READ_ONCE(cpuctx->cgrp) == NULL)
+ return;
+
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+
perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
@@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task)
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -2120,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
del_event_from_groups(event, ctx);
- /*
- * If event was in error state, then keep it
- * that way, otherwise bogus counts will be
- * returned on read(). The only way to get out
- * of error state is by explicit re-enabling
- * of the event
- */
- if (event->state > PERF_EVENT_STATE_OFF) {
- perf_cgroup_event_disable(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- }
-
ctx->generation++;
event->pmu_ctx->nr_events--;
}
@@ -2149,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
}
static void put_event(struct perf_event *event);
-static void event_sched_out(struct perf_event *event,
- struct perf_event_context *ctx);
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state);
static void perf_put_aux_event(struct perf_event *event)
{
@@ -2183,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR);
}
}
@@ -2242,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event)
&event->pmu_ctx->flexible_active;
}
-/*
- * Events that have PERF_EV_CAP_SIBLING require being part of a group and
- * cannot exist on their own, schedule them out and move them into the ERROR
- * state. Also see _perf_event_enable(), it will not be able to recover
- * this ERROR state.
- */
-static inline void perf_remove_sibling_event(struct perf_event *event)
-{
- event_sched_out(event, event->ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
-}
-
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
@@ -2289,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event)
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
+ /*
+ * Events that have PERF_EV_CAP_SIBLING require being part of
+ * a group and cannot exist on their own, schedule them out
+ * and move them into the ERROR state. Also see
+ * _perf_event_enable(), it will not be able to recover this
+ * ERROR state.
+ */
if (sibling->event_caps & PERF_EV_CAP_SIBLING)
- perf_remove_sibling_event(sibling);
+ __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR);
sibling->group_leader = sibling;
list_del_init(&sibling->sibling_list);
@@ -2493,11 +2494,14 @@ __perf_remove_from_context(struct perf_event *event,
state = PERF_EVENT_STATE_EXIT;
if (flags & DETACH_REVOKE)
state = PERF_EVENT_STATE_REVOKED;
- if (flags & DETACH_DEAD) {
- event->pending_disable = 1;
+ if (flags & DETACH_DEAD)
state = PERF_EVENT_STATE_DEAD;
- }
+
event_sched_out(event, ctx);
+
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_disable(event, ctx);
+
perf_event_set_state(event, min(event->state, state));
if (flags & DETACH_GROUP)
@@ -2562,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
event_function_call(event, __perf_remove_from_context, (void *)flags);
}
+static void __event_disable(struct perf_event *event,
+ struct perf_event_context *ctx,
+ enum perf_event_state state)
+{
+ event_sched_out(event, ctx);
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, state);
+}
+
/*
* Cross CPU call to disable a performance event
*/
@@ -2576,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event,
perf_pmu_disable(event->pmu_ctx->pmu);
ctx_time_update_event(ctx, event);
+ /*
+ * When disabling a group leader, the whole group becomes ineligible
+ * to run, so schedule out the full group.
+ */
if (event == event->group_leader)
group_sched_out(event, ctx);
- else
- event_sched_out(event, ctx);
- perf_event_set_state(event, PERF_EVENT_STATE_OFF);
- perf_cgroup_event_disable(event, ctx);
+ /*
+ * But only mark the leader OFF; the siblings will remain
+ * INACTIVE.
+ */
+ __event_disable(event, ctx, PERF_EVENT_STATE_OFF);
perf_pmu_enable(event->pmu_ctx->pmu);
}
@@ -2656,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)
static void perf_event_throttle(struct perf_event *event)
{
- event->pmu->stop(event, 0);
event->hw.interrupts = MAX_INTERRUPTS;
+ event->pmu->stop(event, 0);
if (event == event->group_leader)
perf_log_throttle(event, 0);
}
@@ -7186,18 +7204,18 @@ void perf_event_wakeup(struct perf_event *event)
static void perf_sigtrap(struct perf_event *event)
{
/*
- * We'd expect this to only occur if the irq_work is delayed and either
- * ctx->task or current has changed in the meantime. This can be the
- * case on architectures that do not implement arch_irq_work_raise().
+ * Both perf_pending_task() and perf_pending_irq() can race with the
+ * task exiting.
*/
- if (WARN_ON_ONCE(event->ctx->task != current))
+ if (current->flags & PF_EXITING)
return;
/*
- * Both perf_pending_task() and perf_pending_irq() can race with the
- * task exiting.
+ * We'd expect this to only occur if the irq_work is delayed and either
+ * ctx->task or current has changed in the meantime. This can be the
+ * case on architectures that do not implement arch_irq_work_raise().
*/
- if (current->flags & PF_EXITING)
+ if (WARN_ON_ONCE(event->ctx->task != current))
return;
send_sig_perf((void __user *)event->pending_addr,
@@ -7233,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event)
* CPU-A CPU-B
*
* perf_event_disable_inatomic()
- * @pending_disable = CPU-A;
+ * @pending_disable = 1;
* irq_work_queue();
*
* sched-out
- * @pending_disable = -1;
+ * @pending_disable = 0;
*
* sched-in
* perf_event_disable_inatomic()
- * @pending_disable = CPU-B;
+ * @pending_disable = 1;
* irq_work_queue(); // FAILS
*
* irq_work_run()
@@ -7439,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size,
if (!regs)
return 0;
+ /* No mm, no stack, no dump. */
+ if (!current->mm)
+ return 0;
+
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
@@ -8150,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
const u32 max_stack = event->attr.sample_max_stack;
struct perf_callchain_entry *callchain;
+ if (!current->mm)
+ user = false;
+
if (!kernel && !user)
return &__empty_callchain;
@@ -11091,7 +11116,7 @@ static int perf_uprobe_event_init(struct perf_event *event)
if (event->attr.type != perf_uprobe.type)
return -ENOENT;
- if (!perfmon_capable())
+ if (!capable(CAP_SYS_ADMIN))
return -EACCES;
/*
@@ -11749,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (is_sampling_event(event)) {
+ /*
+ * The throttle can be triggered in the hrtimer handler.
+ * The HRTIMER_NORESTART should be used to stop the timer,
+ * rather than hrtimer_cancel(). See perf_swevent_hrtimer()
+ */
+ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
@@ -11804,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags)
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
- cpu_clock_event_update(event);
+ if (flags & PERF_EF_UPDATE)
+ cpu_clock_event_update(event);
}
static int cpu_clock_event_add(struct perf_event *event, int flags)
@@ -11882,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags)
static void task_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
- task_clock_event_update(event, event->ctx->time);
+ if (flags & PERF_EF_UPDATE)
+ task_clock_event_update(event, event->ctx->time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index d2aef87c7e9f..aa9a759e824f 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* store that will be enabled on successful return
*/
if (!handle->size) { /* A, matches D */
- event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
@@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
- handle->event->pending_disable = smp_processor_id();
+ perf_event_disable_inatomic(handle->event);
perf_output_wakeup(handle);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index bd743900354c..bb184a67ac73 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -940,6 +940,15 @@ void __noreturn do_exit(long code)
taskstats_exit(tsk, group_dead);
trace_sched_process_exit(tsk, group_dead);
+ /*
+ * Since sampling can touch ->mm, make sure to stop everything before we
+ * tear it down.
+ *
+ * Also flushes inherited counters to the parent - before the parent
+ * gets woken up by child-exit notifications.
+ */
+ perf_event_exit_task(tsk);
+
exit_mm();
if (group_dead)
@@ -955,14 +964,6 @@ void __noreturn do_exit(long code)
exit_task_work(tsk);
exit_thread(tsk);
- /*
- * Flush inherited counters to the parent - before the parent
- * gets woken up by child-exit notifications.
- *
- * because of cgroup mode, must be called before cgroup_exit()
- */
- perf_event_exit_task(tsk);
-
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..aef41211c72c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,7 +93,7 @@
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
-#include <linux/stackleak.h>
+#include <linux/kstack_erase.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
@@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
- futex_mm_init(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
@@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->def_flags = 0;
}
+ if (futex_mm_init(mm))
+ goto fail_mm_init;
+
if (mm_alloc_pgd(mm))
goto fail_nopgd;
@@ -1090,6 +1092,8 @@ fail_nocontext:
fail_noid:
mm_free_pgd(mm);
fail_nopgd:
+ futex_hash_free(mm);
+fail_mm_init:
free_mm(mm);
return NULL;
}
@@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
static void mmput_async_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
@@ -1542,14 +1546,14 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
/* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return -EAGAIN;
}
fs->users++;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return 0;
}
tsk->fs = copy_fs_struct(fs);
@@ -2123,9 +2127,8 @@ __latent_entropy struct task_struct *copy_process(
lockdep_init_task(p);
#endif
-#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
-#endif
+
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -2743,7 +2746,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
-noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
{
@@ -3149,13 +3152,13 @@ int ksys_unshare(unsigned long unshare_flags)
if (new_fs) {
fs = current->fs;
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
current->fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
}
if (new_fd)
@@ -3216,7 +3219,7 @@ int unshare_files(void)
return 0;
}
-int sysctl_max_threads(const struct ctl_table *table, int write,
+static int sysctl_max_threads(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -3238,3 +3241,21 @@ int sysctl_max_threads(const struct ctl_table *table, int write,
return 0;
}
+
+static const struct ctl_table fork_sysctl_table[] = {
+ {
+ .procname = "threads-max",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_max_threads,
+ },
+};
+
+static int __init init_fork_sysctl(void)
+{
+ register_sysctl_init("kernel", fork_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_fork_sysctl);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 8d530d0949ff..6a96149aede9 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -201,18 +201,9 @@ static int __restore_freezer_state(struct task_struct *p, void *arg)
void __thaw_task(struct task_struct *p)
{
- unsigned long flags;
-
- spin_lock_irqsave(&freezer_lock, flags);
- if (WARN_ON_ONCE(freezing(p)))
- goto unlock;
-
- if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL))
- goto unlock;
-
- wake_up_state(p, TASK_FROZEN);
-unlock:
- spin_unlock_irqrestore(&freezer_lock, flags);
+ guard(spinlock_irqsave)(&freezer_lock);
+ if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL))
+ wake_up_state(p, TASK_FROZEN);
}
/**
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 565f9717c6ca..d9bb5567af0c 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -42,7 +42,6 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
#include <linux/prctl.h>
-#include <linux/rcuref.h>
#include <linux/mempolicy.h>
#include <linux/mmap_lock.h>
@@ -65,12 +64,11 @@ static struct {
#define futex_queues (__futex_data.queues)
struct futex_private_hash {
- rcuref_t users;
+ int state;
unsigned int hash_mask;
struct rcu_head rcu;
void *mm;
bool custom;
- bool immutable;
struct futex_hash_bucket queues[];
};
@@ -129,6 +127,12 @@ static struct futex_hash_bucket *
__futex_hash(union futex_key *key, struct futex_private_hash *fph);
#ifdef CONFIG_FUTEX_PRIVATE_HASH
+static bool futex_ref_get(struct futex_private_hash *fph);
+static bool futex_ref_put(struct futex_private_hash *fph);
+static bool futex_ref_is_dead(struct futex_private_hash *fph);
+
+enum { FR_PERCPU = 0, FR_ATOMIC };
+
static inline bool futex_key_is_private(union futex_key *key)
{
/*
@@ -138,19 +142,14 @@ static inline bool futex_key_is_private(union futex_key *key)
return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED));
}
-bool futex_private_hash_get(struct futex_private_hash *fph)
+static bool futex_private_hash_get(struct futex_private_hash *fph)
{
- if (fph->immutable)
- return true;
- return rcuref_get(&fph->users);
+ return futex_ref_get(fph);
}
void futex_private_hash_put(struct futex_private_hash *fph)
{
- /* Ignore return value, last put is verified via rcuref_is_dead() */
- if (fph->immutable)
- return;
- if (rcuref_put(&fph->users))
+ if (futex_ref_put(fph))
wake_up_var(fph->mm);
}
@@ -243,14 +242,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm,
fph = rcu_dereference_protected(mm->futex_phash,
lockdep_is_held(&mm->futex_hash_lock));
if (fph) {
- if (!rcuref_is_dead(&fph->users)) {
+ if (!futex_ref_is_dead(fph)) {
mm->futex_phash_new = new;
return false;
}
futex_rehash_private(fph, new);
}
- rcu_assign_pointer(mm->futex_phash, new);
+ new->state = FR_PERCPU;
+ scoped_guard(rcu) {
+ mm->futex_batches = get_state_synchronize_rcu();
+ rcu_assign_pointer(mm->futex_phash, new);
+ }
kvfree_rcu(fph, rcu);
return true;
}
@@ -289,9 +292,7 @@ again:
if (!fph)
return NULL;
- if (fph->immutable)
- return fph;
- if (rcuref_get(&fph->users))
+ if (futex_private_hash_get(fph))
return fph;
}
futex_pivot_hash(mm);
@@ -583,8 +584,8 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
if (futex_get_value(&node, naddr))
return -EFAULT;
- if (node != FUTEX_NO_NODE &&
- (node >= MAX_NUMNODES || !node_possible(node)))
+ if ((node != FUTEX_NO_NODE) &&
+ ((unsigned int)node >= MAX_NUMNODES || !node_possible(node)))
return -EINVAL;
}
@@ -1524,19 +1525,221 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb,
}
#define FH_CUSTOM 0x01
-#define FH_IMMUTABLE 0x02
#ifdef CONFIG_FUTEX_PRIVATE_HASH
+
+/*
+ * futex-ref
+ *
+ * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that
+ * code because it just doesn't fit right.
+ *
+ * Dual counter, per-cpu / atomic approach like percpu-refcount, except it
+ * re-initializes the state automatically, such that the fph swizzle is also a
+ * transition back to per-cpu.
+ */
+
+static void futex_ref_rcu(struct rcu_head *head);
+
+static void __futex_ref_atomic_begin(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * The counter we're about to switch to must have fully switched;
+ * otherwise it would be impossible for it to have reported success
+ * from futex_ref_is_dead().
+ */
+ WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0);
+
+ /*
+ * Set the atomic to the bias value such that futex_ref_{get,put}()
+ * will never observe 0. Will be fixed up in __futex_ref_atomic_end()
+ * when folding in the percpu count.
+ */
+ atomic_long_set(&mm->futex_atomic, LONG_MAX);
+ smp_store_release(&fph->state, FR_ATOMIC);
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static void __futex_ref_atomic_end(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+ unsigned int count = 0;
+ long ret;
+ int cpu;
+
+ /*
+ * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC
+ * and per this RCU callback, everybody must now observe this state and
+ * use the atomic variable.
+ */
+ WARN_ON_ONCE(fph->state != FR_ATOMIC);
+
+ /*
+ * Therefore the per-cpu counter is now stable, sum and reset.
+ */
+ for_each_possible_cpu(cpu) {
+ unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu);
+ count += *ptr;
+ *ptr = 0;
+ }
+
+ /*
+ * Re-init for the next cycle.
+ */
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+
+ /*
+ * Add actual count, subtract bias and initial refcount.
+ *
+ * The moment this atomic operation happens, futex_ref_is_dead() can
+ * become true.
+ */
+ ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic);
+ if (!ret)
+ wake_up_var(mm);
+
+ WARN_ON_ONCE(ret < 0);
+ mmput_async(mm);
+}
+
+static void futex_ref_rcu(struct rcu_head *head)
+{
+ struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu);
+ struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash);
+
+ if (fph->state == FR_PERCPU) {
+ /*
+ * Per this extra grace-period, everybody must now observe
+ * fph as the current fph and no previously observed fph's
+ * are in-flight.
+ *
+ * Notably, nobody will now rely on the atomic
+ * futex_ref_is_dead() state anymore so we can begin the
+ * migration of the per-cpu counter into the atomic.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ __futex_ref_atomic_end(fph);
+}
+
+/*
+ * Drop the initial refcount and transition to atomics.
+ */
+static void futex_ref_drop(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ /*
+ * Can only transition the current fph;
+ */
+ WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph);
+ /*
+ * We enqueue at least one RCU callback. Ensure mm stays if the task
+ * exits before the transition is completed.
+ */
+ mmget(mm);
+
+ /*
+ * In order to avoid the following scenario:
+ *
+ * futex_hash() __futex_pivot_hash()
+ * guard(rcu); guard(mm->futex_hash_lock);
+ * fph = mm->futex_phash;
+ * rcu_assign_pointer(&mm->futex_phash, new);
+ * futex_hash_allocate()
+ * futex_ref_drop()
+ * fph->state = FR_ATOMIC;
+ * atomic_set(, BIAS);
+ *
+ * futex_private_hash_get(fph); // OOPS
+ *
+ * Where an old fph (which is FR_ATOMIC) and should fail on
+ * inc_not_zero, will succeed because a new transition is started and
+ * the atomic is bias'ed away from 0.
+ *
+ * There must be at least one full grace-period between publishing a
+ * new fph and trying to replace it.
+ */
+ if (poll_state_synchronize_rcu(mm->futex_batches)) {
+ /*
+ * There was a grace-period, we can begin now.
+ */
+ __futex_ref_atomic_begin(fph);
+ return;
+ }
+
+ call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu);
+}
+
+static bool futex_ref_get(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+ this_cpu_inc(*mm->futex_ref);
+ return true;
+ }
+
+ return atomic_long_inc_not_zero(&mm->futex_atomic);
+}
+
+static bool futex_ref_put(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU) {
+ this_cpu_dec(*mm->futex_ref);
+ return false;
+ }
+
+ return atomic_long_dec_and_test(&mm->futex_atomic);
+}
+
+static bool futex_ref_is_dead(struct futex_private_hash *fph)
+{
+ struct mm_struct *mm = fph->mm;
+
+ guard(rcu)();
+
+ if (smp_load_acquire(&fph->state) == FR_PERCPU)
+ return false;
+
+ return atomic_long_read(&mm->futex_atomic) == 0;
+}
+
+int futex_mm_init(struct mm_struct *mm)
+{
+ mutex_init(&mm->futex_hash_lock);
+ RCU_INIT_POINTER(mm->futex_phash, NULL);
+ mm->futex_phash_new = NULL;
+ /* futex-ref */
+ atomic_long_set(&mm->futex_atomic, 0);
+ mm->futex_batches = get_state_synchronize_rcu();
+ mm->futex_ref = alloc_percpu(unsigned int);
+ if (!mm->futex_ref)
+ return -ENOMEM;
+ this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */
+ return 0;
+}
+
void futex_hash_free(struct mm_struct *mm)
{
struct futex_private_hash *fph;
+ free_percpu(mm->futex_ref);
kvfree(mm->futex_phash_new);
fph = rcu_dereference_raw(mm->futex_phash);
- if (fph) {
- WARN_ON_ONCE(rcuref_read(&fph->users) > 1);
+ if (fph)
kvfree(fph);
- }
}
static bool futex_pivot_pending(struct mm_struct *mm)
@@ -1549,7 +1752,7 @@ static bool futex_pivot_pending(struct mm_struct *mm)
return true;
fph = rcu_dereference(mm->futex_phash);
- return rcuref_is_dead(&fph->users);
+ return futex_ref_is_dead(fph);
}
static bool futex_hash_less(struct futex_private_hash *a,
@@ -1591,21 +1794,20 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
*/
scoped_guard(rcu) {
fph = rcu_dereference(mm->futex_phash);
- if (fph && (!fph->hash_mask || fph->immutable)) {
+ if (fph && !fph->hash_mask) {
if (custom)
return -EBUSY;
return 0;
}
}
- fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ fph = kvzalloc(struct_size(fph, queues, hash_slots),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!fph)
return -ENOMEM;
- rcuref_init(&fph->users, 1);
fph->hash_mask = hash_slots ? hash_slots - 1 : 0;
fph->custom = custom;
- fph->immutable = !!(flags & FH_IMMUTABLE);
fph->mm = mm;
for (i = 0; i < hash_slots; i++)
@@ -1629,13 +1831,23 @@ again:
mm->futex_phash_new = NULL;
if (fph) {
+ if (cur && !cur->hash_mask) {
+ /*
+ * If two threads simultaneously request the global
+ * hash then the first one performs the switch,
+ * the second one returns here.
+ */
+ free = fph;
+ mm->futex_phash_new = new;
+ return -EBUSY;
+ }
if (cur && !new) {
/*
* If we have an existing hash, but do not yet have
* allocated a replacement hash, drop the initial
* reference on the existing hash.
*/
- futex_private_hash_put(cur);
+ futex_ref_drop(cur);
}
if (new) {
@@ -1712,19 +1924,6 @@ static int futex_hash_get_slots(void)
return 0;
}
-static int futex_hash_get_immutable(void)
-{
- struct futex_private_hash *fph;
-
- guard(rcu)();
- fph = rcu_dereference(current->mm->futex_phash);
- if (fph && fph->immutable)
- return 1;
- if (fph && !fph->hash_mask)
- return 1;
- return 0;
-}
-
#else
static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags)
@@ -1737,10 +1936,6 @@ static int futex_hash_get_slots(void)
return 0;
}
-static int futex_hash_get_immutable(void)
-{
- return 0;
-}
#endif
int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
@@ -1750,10 +1945,8 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
switch (arg2) {
case PR_FUTEX_HASH_SET_SLOTS:
- if (arg4 & ~FH_FLAG_IMMUTABLE)
+ if (arg4)
return -EINVAL;
- if (arg4 & FH_FLAG_IMMUTABLE)
- flags |= FH_IMMUTABLE;
ret = futex_hash_allocate(arg3, flags);
break;
@@ -1761,10 +1954,6 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4)
ret = futex_hash_get_slots();
break;
- case PR_FUTEX_HASH_GET_IMMUTABLE:
- ret = futex_hash_get_immutable();
- break;
-
default:
ret = -EINVAL;
break;
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index fcd1617212ee..c74eac572acd 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -228,14 +228,12 @@ extern void futex_hash_get(struct futex_hash_bucket *hb);
extern void futex_hash_put(struct futex_hash_bucket *hb);
extern struct futex_private_hash *futex_private_hash(void);
-extern bool futex_private_hash_get(struct futex_private_hash *fph);
extern void futex_private_hash_put(struct futex_private_hash *fph);
#else /* !CONFIG_FUTEX_PRIVATE_HASH */
static inline void futex_hash_get(struct futex_hash_bucket *hb) { }
static inline void futex_hash_put(struct futex_hash_bucket *hb) { }
static inline struct futex_private_hash *futex_private_hash(void) { return NULL; }
-static inline bool futex_private_hash_get(void) { return false; }
static inline void futex_private_hash_put(struct futex_private_hash *fph) { }
#endif
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 3f02a0e45254..1da5e9d9da71 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -144,6 +144,17 @@ config GENERIC_IRQ_DEBUGFS
config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
bool
+config IRQ_KUNIT_TEST
+ bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y
+ default KUNIT_ALL_TESTS
+ imply SMP
+ help
+ This option enables KUnit tests for the IRQ subsystem API. These are
+ only for development and testing, not for regular kernel use cases.
+
+ If unsure, say N.
+
endmenu
config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index c0f44c06d69d..6ab3a4055667 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
obj-$(CONFIG_SMP) += affinity.o
obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
+obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 44a4eba80315..4013e6ad2b2f 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -69,21 +69,20 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
* have multiple sets, build each sets affinity mask separately.
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
- unsigned int this_vecs = affd->set_size[i];
- int j;
- struct cpumask *result = group_cpus_evenly(this_vecs);
+ unsigned int nr_masks, this_vecs = affd->set_size[i];
+ struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks);
if (!result) {
kfree(masks);
return NULL;
}
- for (j = 0; j < this_vecs; j++)
+ for (int j = 0; j < nr_masks; j++)
cpumask_copy(&masks[curvec + j].mask, &result[j]);
kfree(result);
- curvec += this_vecs;
- usedvecs += this_vecs;
+ curvec += nr_masks;
+ usedvecs += nr_masks;
}
/* Fill out vectors at the end that don't need affinity */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b0e0a7332993..624106e886ad 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -205,6 +205,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
void irq_startup_managed(struct irq_desc *desc)
{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+
+ /*
+ * Clear managed-shutdown flag, so we don't repeat managed-startup for
+ * multiple hotplugs, and cause imbalanced disable depth.
+ */
+ irqd_clr_managed_shutdown(d);
+
/*
* Only start it up when the disable depth is 1, so that a disable,
* hotunplug, hotplug sequence does not end up enabling it during
@@ -449,22 +457,33 @@ void unmask_threaded_irq(struct irq_desc *desc)
unmask_irq(desc);
}
-static bool irq_check_poll(struct irq_desc *desc)
+/* Busy wait until INPROGRESS is cleared */
+static bool irq_wait_on_inprogress(struct irq_desc *desc)
{
- if (!(desc->istate & IRQS_POLL_INPROGRESS))
- return false;
- return irq_wait_for_poll(desc);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ do {
+ raw_spin_unlock(&desc->lock);
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ } while (irqd_irq_inprogress(&desc->irq_data));
+
+ /* Might have been disabled in meantime */
+ return !irqd_irq_disabled(&desc->irq_data) && desc->action;
+ }
+ return false;
}
static bool irq_can_handle_pm(struct irq_desc *desc)
{
- unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+ struct irq_data *irqd = &desc->irq_data;
+ const struct cpumask *aff;
/*
* If the interrupt is not in progress and is not an armed
* wakeup interrupt, proceed.
*/
- if (!irqd_has_set(&desc->irq_data, mask))
+ if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED))
return true;
/*
@@ -472,13 +491,54 @@ static bool irq_can_handle_pm(struct irq_desc *desc)
* and suspended, disable it and notify the pm core about the
* event.
*/
- if (irq_pm_check_wakeup(desc))
+ if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) {
+ irq_pm_handle_wakeup(desc);
+ return false;
+ }
+
+ /* Check whether the interrupt is polled on another CPU */
+ if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) {
+ if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+ "irq poll in progress on cpu %d for irq %d\n",
+ smp_processor_id(), desc->irq_data.irq))
+ return false;
+ return irq_wait_on_inprogress(desc);
+ }
+
+ /* The below works only for single target interrupts */
+ if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) ||
+ !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq)
return false;
/*
- * Handle a potential concurrent poll on a different core.
+ * If the interrupt affinity was moved to this CPU and the
+ * interrupt is currently handled on the previous target CPU, then
+ * busy wait for INPROGRESS to be cleared. Otherwise for edge type
+ * interrupts the handler might get stuck on the previous target:
+ *
+ * CPU 0 CPU 1 (new target)
+ * handle_edge_irq()
+ * repeat:
+ * handle_event() handle_edge_irq()
+ * if (INPROGESS) {
+ * set(PENDING);
+ * mask();
+ * return;
+ * }
+ * if (PENDING) {
+ * clear(PENDING);
+ * unmask();
+ * goto repeat;
+ * }
+ *
+ * This happens when the device raises interrupts with a high rate
+ * and always before handle_event() completes and the CPU0 handler
+ * can clear INPROGRESS. This has been observed in virtual machines.
*/
- return irq_check_poll(desc);
+ aff = irq_data_get_effective_affinity_mask(irqd);
+ if (cpumask_first(aff) != smp_processor_id())
+ return false;
+ return irq_wait_on_inprogress(desc);
}
static inline bool irq_can_handle_actions(struct irq_desc *desc)
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index f07529ae4895..755346ea9819 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -210,13 +210,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu)
!irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity))
return;
- /*
- * Don't restore suspended interrupts here when a system comes back
- * from S3. They are reenabled via resume_device_irqs().
- */
- if (desc->istate & IRQS_SUSPENDED)
- return;
-
if (irqd_is_managed_and_shutdown(data))
irq_startup_managed(desc);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index aebfe225c9a6..0164ca48da59 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,6 +20,7 @@
#define istate core_internal_state__do_not_mess_with_it
extern bool noirqdebug;
+extern int irq_poll_cpu;
extern struct irqaction chained_action;
@@ -112,7 +113,6 @@ irqreturn_t handle_irq_event(struct irq_desc *desc);
int check_irq_resend(struct irq_desc *desc, bool inject);
void clear_irq_resend(struct irq_desc *desc);
void irq_resend_init(struct irq_desc *desc);
-bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
void wake_threads_waitq(struct irq_desc *desc);
@@ -277,11 +277,11 @@ static inline bool irq_is_nmi(struct irq_desc *desc)
}
#ifdef CONFIG_PM_SLEEP
-bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_handle_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
#else
-static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { }
static inline void
irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
static inline void
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 1a3d483548e2..ae4c9cbd1b4b 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode,
void *data)
{
struct irq_sim_work_ctx *work_ctx __free(kfree) =
- kmalloc(sizeof(*work_ctx), GFP_KERNEL);
+ kzalloc(sizeof(*work_ctx), GFP_KERNEL);
if (!work_ctx)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c
new file mode 100644
index 000000000000..5161b56a12f9
--- /dev/null
+++ b/kernel/irq/irq_test.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: LGPL-2.1+
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqdomain.h>
+#include <linux/nodemask.h>
+#include <kunit/test.h>
+
+#include "internals.h"
+
+static irqreturn_t noop_handler(int irq, void *data)
+{
+ return IRQ_HANDLED;
+}
+
+static void noop(struct irq_data *data) { }
+static unsigned int noop_ret(struct irq_data *data) { return 0; }
+
+static int noop_affinity(struct irq_data *data, const struct cpumask *dest,
+ bool force)
+{
+ irq_data_update_effective_affinity(data, dest);
+
+ return 0;
+}
+
+static struct irq_chip fake_irq_chip = {
+ .name = "fake",
+ .irq_startup = noop_ret,
+ .irq_shutdown = noop,
+ .irq_enable = noop,
+ .irq_disable = noop,
+ .irq_ack = noop,
+ .irq_mask = noop,
+ .irq_unmask = noop,
+ .irq_set_affinity = noop_affinity,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static void irq_disable_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_free_disabled_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ int virq, ret;
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ free_irq(virq, NULL);
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_shutdown_depth_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ .mask = CPU_MASK_ALL,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for managed shutdown");
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ irq_shutdown_and_deactivate(desc);
+
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+
+ KUNIT_EXPECT_EQ(test, irq_activate(desc), 0);
+#ifdef CONFIG_SMP
+ irq_startup_managed(desc);
+#endif
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static void irq_cpuhotplug_test(struct kunit *test)
+{
+ struct irq_desc *desc;
+ struct irq_data *data;
+ int virq, ret;
+ struct irq_affinity_desc affinity = {
+ .is_managed = 1,
+ };
+
+ if (!IS_ENABLED(CONFIG_SMP))
+ kunit_skip(test, "requires CONFIG_SMP for CPU hotplug");
+ if (!get_cpu_device(1))
+ kunit_skip(test, "requires more than 1 CPU for CPU hotplug");
+ if (!cpu_is_hotpluggable(1))
+ kunit_skip(test, "CPU 1 must be hotpluggable");
+
+ cpumask_copy(&affinity.mask, cpumask_of(1));
+
+ virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity);
+ KUNIT_ASSERT_GE(test, virq, 0);
+
+ irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq);
+
+ desc = irq_to_desc(virq);
+ KUNIT_ASSERT_PTR_NE(test, desc, NULL);
+
+ data = irq_desc_get_irq_data(desc);
+ KUNIT_ASSERT_PTR_NE(test, data, NULL);
+
+ ret = request_irq(virq, noop_handler, 0, "test_irq", NULL);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data));
+
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ disable_irq(virq);
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ KUNIT_EXPECT_EQ(test, remove_cpu(1), 0);
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+ KUNIT_EXPECT_GE(test, desc->depth, 1);
+ KUNIT_EXPECT_EQ(test, add_cpu(1), 0);
+
+ KUNIT_EXPECT_FALSE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_FALSE(test, irqd_is_started(data));
+ KUNIT_EXPECT_EQ(test, desc->depth, 1);
+
+ enable_irq(virq);
+ KUNIT_EXPECT_TRUE(test, irqd_is_activated(data));
+ KUNIT_EXPECT_TRUE(test, irqd_is_started(data));
+ KUNIT_EXPECT_EQ(test, desc->depth, 0);
+
+ free_irq(virq, NULL);
+}
+
+static struct kunit_case irq_test_cases[] = {
+ KUNIT_CASE(irq_disable_depth_test),
+ KUNIT_CASE(irq_free_disabled_test),
+ KUNIT_CASE(irq_shutdown_depth_test),
+ KUNIT_CASE(irq_cpuhotplug_test),
+ {}
+};
+
+static struct kunit_suite irq_test_suite = {
+ .name = "irq_test_cases",
+ .test_cases = irq_test_cases,
+};
+
+kunit_test_suite(irq_test_suite);
+MODULE_DESCRIPTION("IRQ unit test suite");
+MODULE_LICENSE("GPL");
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index c8b6de09047b..4afbd3ac532f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -317,6 +317,7 @@ static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info
domain->flags |= info->domain_flags;
domain->exit = info->exit;
+ domain->dev = info->dev;
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
if (info->parent) {
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 9febe797a5f6..9b09ad3f9914 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -889,6 +889,7 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
if (domain) {
irq_domain_update_bus_token(domain, info->bus_token);
+ domain->dev = info->dev;
if (info->flags & MSI_FLAG_PARENT_PM_DEV)
domain->pm_dev = parent->pm_dev;
}
@@ -1051,6 +1052,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
bundle->info.data = domain_data;
bundle->info.chip_data = chip_data;
bundle->info.alloc_data = &bundle->alloc_info;
+ bundle->info.dev = dev;
pops = parent->msi_parent_ops;
snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
@@ -1089,7 +1091,6 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
if (!domain)
return false;
- domain->dev = dev;
dev->msi.data->__domains[domid].domain = domain;
if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) {
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 445912d51033..f7394729cedc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -13,17 +13,13 @@
#include "internals.h"
-bool irq_pm_check_wakeup(struct irq_desc *desc)
+void irq_pm_handle_wakeup(struct irq_desc *desc)
{
- if (irqd_is_wakeup_armed(&desc->irq_data)) {
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
- desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
- desc->depth++;
- irq_disable(desc);
- pm_system_irq_wakeup(irq_desc_get_irq(desc));
- return true;
- }
- return false;
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_irq_wakeup(irq_desc_get_irq(desc));
}
/*
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 8f26982e7300..73280ccb74b0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -19,45 +19,10 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(struct timer_list *unused);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
-static int irq_poll_cpu;
+int irq_poll_cpu;
static atomic_t irq_poll_active;
/*
- * We wait here for a poller to finish.
- *
- * If the poll runs on this CPU, then we yell loudly and return
- * false. That will leave the interrupt line disabled in the worst
- * case, but it should never happen.
- *
- * We wait until the poller is done and then recheck disabled and
- * action (about to be disabled). Only if it's still active, we return
- * true and let the handler run.
- */
-bool irq_wait_for_poll(struct irq_desc *desc)
-{
- lockdep_assert_held(&desc->lock);
-
- if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
- "irq poll in progress on cpu %d for irq %d\n",
- smp_processor_id(), desc->irq_data.irq))
- return false;
-
-#ifdef CONFIG_SMP
- do {
- raw_spin_unlock(&desc->lock);
- while (irqd_irq_inprogress(&desc->irq_data))
- cpu_relax();
- raw_spin_lock(&desc->lock);
- } while (irqd_irq_inprogress(&desc->irq_data));
- /* Might have been disabled in meantime */
- return !irqd_irq_disabled(&desc->irq_data) && desc->action;
-#else
- return false;
-#endif
-}
-
-
-/*
* Recovery handler for misrouted interrupts.
*/
static bool try_one_irq(struct irq_desc *desc, bool force)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 9c59fa480b0b..351cd7d76dfa 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1080,7 +1080,7 @@ int kernel_kexec(void)
console_suspend_all();
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
- goto Resume_console;
+ goto Resume_devices;
/*
* dpm_suspend_end() must be called after dpm_suspend_start()
* to complete the transition, like in the hibernation flows
@@ -1135,7 +1135,6 @@ int kernel_kexec(void)
dpm_resume_start(PMSG_RESTORE);
Resume_devices:
dpm_resume_end(PMSG_RESTORE);
- Resume_console:
console_resume_all();
thaw_processes();
Restore_console:
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 69fe76fd9233..b835033c65eb 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -751,7 +751,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
/* Calculate and store the digest of segments */
static int kexec_calculate_store_digests(struct kimage *image)
{
- struct sha256_state state;
+ struct sha256_ctx sctx;
int ret = 0, i, j, zero_buf_sz, sha_region_sz;
size_t nullsz;
u8 digest[SHA256_DIGEST_SIZE];
@@ -770,7 +770,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (!sha_regions)
return -ENOMEM;
- sha256_init(&state);
+ sha256_init(&sctx);
for (j = i = 0; i < image->nr_segments; i++) {
struct kexec_segment *ksegment;
@@ -796,7 +796,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (check_ima_segment_index(image, i))
continue;
- sha256_update(&state, ksegment->kbuf, ksegment->bufsz);
+ sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz);
/*
* Assume rest of the buffer is filled with zero and
@@ -808,7 +808,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
if (bytes > zero_buf_sz)
bytes = zero_buf_sz;
- sha256_update(&state, zero_buf, bytes);
+ sha256_update(&sctx, zero_buf, bytes);
nullsz -= bytes;
}
@@ -817,7 +817,7 @@ static int kexec_calculate_store_digests(struct kimage *image)
j++;
}
- sha256_final(&state, digest);
+ sha256_final(&sctx, digest);
ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions",
sha_regions, sha_region_sz, 0);
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 69b953551677..d3b13a909913 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -164,11 +164,21 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
}
/* almost as free_reserved_page(), just don't free the page */
-static void kho_restore_page(struct page *page)
+static void kho_restore_page(struct page *page, unsigned int order)
{
- ClearPageReserved(page);
- init_page_count(page);
- adjust_managed_page_count(page, 1);
+ unsigned int nr_pages = (1 << order);
+
+ /* Head page gets refcount of 1. */
+ set_page_count(page, 1);
+
+ /* For higher order folios, tail pages get a page count of zero. */
+ for (unsigned int i = 1; i < nr_pages; i++)
+ set_page_count(page + i, 0);
+
+ if (order > 0)
+ prep_compound_page(page, order);
+
+ adjust_managed_page_count(page, nr_pages);
}
/**
@@ -186,15 +196,10 @@ struct folio *kho_restore_folio(phys_addr_t phys)
return NULL;
order = page->private;
- if (order) {
- if (order > MAX_PAGE_ORDER)
- return NULL;
-
- prep_compound_page(page, order);
- } else {
- kho_restore_page(page);
- }
+ if (order > MAX_PAGE_ORDER)
+ return NULL;
+ kho_restore_page(page, order);
return page_folio(page);
}
EXPORT_SYMBOL_GPL(kho_restore_folio);
@@ -305,8 +310,8 @@ err_free:
return -ENOMEM;
}
-static void deserialize_bitmap(unsigned int order,
- struct khoser_mem_bitmap_ptr *elm)
+static void __init deserialize_bitmap(unsigned int order,
+ struct khoser_mem_bitmap_ptr *elm)
{
struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
unsigned long bit;
diff --git a/kernel/stackleak.c b/kernel/kstack_erase.c
index bb65321761b4..e49bb88b4f0a 100644
--- a/kernel/stackleak.c
+++ b/kernel/kstack_erase.c
@@ -6,14 +6,14 @@
*
* Author: Alexander Popov <alex.popov@linux.com>
*
- * STACKLEAK reduces the information which kernel stack leak bugs can
+ * KSTACK_ERASE reduces the information which kernel stack leak bugs can
* reveal and blocks some uninitialized stack variable attacks.
*/
-#include <linux/stackleak.h>
+#include <linux/kstack_erase.h>
#include <linux/kprobes.h>
-#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
+#ifdef CONFIG_KSTACK_ERASE_RUNTIME_DISABLE
#include <linux/jump_label.h>
#include <linux/string_choices.h>
#include <linux/sysctl.h>
@@ -68,7 +68,7 @@ late_initcall(stackleak_sysctls_init);
#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass)
#else
#define skip_erasing() false
-#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
+#endif /* CONFIG_KSTACK_ERASE_RUNTIME_DISABLE */
#ifndef __stackleak_poison
static __always_inline void __stackleak_poison(unsigned long erase_low,
@@ -91,7 +91,7 @@ static __always_inline void __stackleak_erase(bool on_task_stack)
erase_low = stackleak_find_top_of_poison(task_stack_low,
current->lowest_stack);
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
current->prev_lowest_stack = erase_low;
#endif
@@ -113,7 +113,7 @@ static __always_inline void __stackleak_erase(bool on_task_stack)
else
erase_high = task_stack_high;
- __stackleak_poison(erase_low, erase_high, STACKLEAK_POISON);
+ __stackleak_poison(erase_low, erase_high, KSTACK_ERASE_POISON);
/* Reset the 'lowest_stack' value for the next syscall */
current->lowest_stack = task_stack_high;
@@ -156,16 +156,16 @@ asmlinkage void noinstr stackleak_erase_off_task_stack(void)
__stackleak_erase(false);
}
-void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
+void __used __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void)
{
unsigned long sp = current_stack_pointer;
/*
- * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
- * STACKLEAK_SEARCH_DEPTH makes the poison search in
+ * Having CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE larger than
+ * KSTACK_ERASE_SEARCH_DEPTH makes the poison search in
* stackleak_erase() unreliable. Let's prevent that.
*/
- BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);
+ BUILD_BUG_ON(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE > KSTACK_ERASE_SEARCH_DEPTH);
/* 'lowest_stack' should be aligned on the register width boundary */
sp = ALIGN(sp, sizeof(unsigned long));
@@ -174,4 +174,4 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
current->lowest_stack = sp;
}
}
-EXPORT_SYMBOL(stackleak_track_stack);
+EXPORT_SYMBOL(__sanitizer_cov_stack_depth);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dd2bbf73718b..2d4c5bab5af8 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -297,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
dst->nr += src->nr;
}
-struct lock_class_stats lock_stats(struct lock_class *class)
+void lock_stats(struct lock_class *class, struct lock_class_stats *stats)
{
- struct lock_class_stats stats;
int cpu, i;
- memset(&stats, 0, sizeof(struct lock_class_stats));
+ memset(stats, 0, sizeof(struct lock_class_stats));
for_each_possible_cpu(cpu) {
struct lock_class_stats *pcs =
&per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
- for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
- stats.contention_point[i] += pcs->contention_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++)
+ stats->contention_point[i] += pcs->contention_point[i];
- for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
- stats.contending_point[i] += pcs->contending_point[i];
+ for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++)
+ stats->contending_point[i] += pcs->contending_point[i];
- lock_time_add(&pcs->read_waittime, &stats.read_waittime);
- lock_time_add(&pcs->write_waittime, &stats.write_waittime);
+ lock_time_add(&pcs->read_waittime, &stats->read_waittime);
+ lock_time_add(&pcs->write_waittime, &stats->write_waittime);
- lock_time_add(&pcs->read_holdtime, &stats.read_holdtime);
- lock_time_add(&pcs->write_holdtime, &stats.write_holdtime);
+ lock_time_add(&pcs->read_holdtime, &stats->read_holdtime);
+ lock_time_add(&pcs->write_holdtime, &stats->write_holdtime);
- for (i = 0; i < ARRAY_SIZE(stats.bounces); i++)
- stats.bounces[i] += pcs->bounces[i];
+ for (i = 0; i < ARRAY_SIZE(stats->bounces); i++)
+ stats->bounces[i] += pcs->bounces[i];
}
-
- return stats;
}
void clear_lock_stats(struct lock_class *class)
@@ -6619,8 +6616,16 @@ void lockdep_unregister_key(struct lock_class_key *key)
if (need_callback)
call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
- /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
- synchronize_rcu();
+ /*
+ * Wait until is_dynamic_key() has finished accessing k->hash_entry.
+ *
+ * Some operations like __qdisc_destroy() will call this in a debug
+ * kernel, and the network traffic is disabled while waiting, hence
+ * the delay of the wait matters in debugging cases. Currently use a
+ * synchronize_rcu_expedited() to speed up the wait at the cost of
+ * system IPIs. TODO: Replace RCU with hazptr for this.
+ */
+ synchronize_rcu_expedited();
}
EXPORT_SYMBOL_GPL(lockdep_unregister_key);
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 82156caf77d1..0e5e6ffe91a3 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -47,29 +47,31 @@ enum {
__LOCKF(USED_READ)
};
+enum {
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
-static const unsigned long LOCKF_ENABLED_IRQ =
+ LOCKF_ENABLED_IRQ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
-static const unsigned long LOCKF_USED_IN_IRQ =
+ LOCKF_USED_IN_IRQ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
-static const unsigned long LOCKF_ENABLED_IRQ_READ =
+ LOCKF_ENABLED_IRQ_READ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
-static const unsigned long LOCKF_USED_IN_IRQ_READ =
+ LOCKF_USED_IN_IRQ_READ =
#include "lockdep_states.h"
- 0;
+ 0,
#undef LOCKDEP_STATE
+};
#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b52c07c4707c..1916db9aa46b 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -657,7 +657,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!test_bit(idx, lock_classes_in_use))
continue;
iter->class = class;
- iter->stats = lock_stats(class);
+ lock_stats(class, &iter->stats);
iter++;
}
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 6e6f6071cfa2..949103fd8e9b 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
{
lockdep_assert_held(&lock->wait_lock);
- /* Mark the current thread as blocked on the lock: */
- task->blocked_on = waiter;
+ /* Current thread can't be already blocked (since it's executing!) */
+ DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
}
void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct task_struct *task)
{
+ struct mutex *blocked_on = __get_task_blocked_on(task);
+
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
DEBUG_LOCKS_WARN_ON(waiter->task != task);
- DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
- task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock);
INIT_LIST_HEAD(&waiter->list);
waiter->task = NULL;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index a39ecccbd106..de7d6702cd96 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -191,9 +191,7 @@ static void
__mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct list_head *list)
{
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX);
-#endif
debug_mutex_add_waiter(lock, waiter, current);
list_add_tail(&waiter->list, list);
@@ -209,9 +207,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter)
__mutex_clear_flag(lock, MUTEX_FLAGS);
debug_mutex_remove_waiter(lock, waiter, current);
-#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
hung_task_clear_blocker();
-#endif
}
/*
@@ -644,6 +640,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err_early_kill;
}
+ __set_task_blocked_on(current, lock);
set_current_state(state);
trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
@@ -680,6 +677,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
first = __mutex_waiter_is_first(lock, &waiter);
+ /*
+ * As we likely have been woken up by task
+ * that has cleared our blocked_on state, re-set
+ * it to the lock we are trying to acquire.
+ */
+ set_task_blocked_on(current, lock);
set_current_state(state);
/*
* Here we order against unlock; we must either see it change
@@ -691,8 +694,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
if (first) {
trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ /*
+ * mutex_optimistic_spin() can call schedule(), so
+ * clear blocked on so we don't become unselectable
+ * to run.
+ */
+ clear_task_blocked_on(current, lock);
if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
break;
+ set_task_blocked_on(current, lock);
trace_contention_begin(lock, LCB_F_MUTEX);
}
@@ -700,6 +710,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
}
raw_spin_lock_irqsave(&lock->wait_lock, flags);
acquired:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
if (ww_ctx) {
@@ -729,9 +740,11 @@ skip_wait:
return 0;
err:
+ __clear_task_blocked_on(current, lock);
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
+ WARN_ON(__get_task_blocked_on(current));
trace_contention_end(lock, ret);
raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
@@ -942,6 +955,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
next = waiter->task;
debug_mutex_wake_waiter(lock, waiter);
+ __clear_task_blocked_on(next, lock);
wake_q_add(&wake_q, next);
}
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index cbff35b9b7ae..2e8080a9bee3 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -6,7 +6,7 @@
*
* Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
-
+#ifndef CONFIG_PREEMPT_RT
/*
* This is the control structure for tasks blocked on mutex, which resides
* on the blocked task's kernel stack:
@@ -70,3 +70,4 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
# define debug_mutex_unlock(lock) do { } while (0)
# define debug_mutex_init(lock, name, key) do { } while (0)
#endif /* !CONFIG_DEBUG_MUTEXES */
+#endif /* CONFIG_PREEMPT_RT */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 2d933528a0fa..bafd5af98eae 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -13,6 +13,24 @@
*/
int max_lock_depth = 1024;
+static const struct ctl_table rtmutex_sysctl_table[] = {
+ {
+ .procname = "max_lock_depth",
+ .data = &max_lock_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_rtmutex_sysctl(void)
+{
+ register_sysctl_init("kernel", rtmutex_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rtmutex_sysctl);
+
/*
* Debug aware fast / slowpath lock,trylock,unlock
*
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 2ddb827e3bea..8572dba95af4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -727,8 +727,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
return ret;
}
-#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
-
static inline enum owner_state
rwsem_owner_state(struct task_struct *owner, unsigned long flags)
{
@@ -835,7 +833,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
enum owner_state owner_state;
owner_state = rwsem_spin_on_owner(sem);
- if (!(owner_state & OWNER_SPINNABLE))
+ if (owner_state == OWNER_NONSPINNABLE)
break;
/*
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 37f025a096c9..086fd5487ca7 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -284,6 +284,12 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
#ifndef WW_RT
debug_mutex_wake_waiter(lock, waiter);
#endif
+ /*
+ * When waking up the task to die, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ */
+ __clear_task_blocked_on(waiter->task, lock);
wake_q_add(wake_q, waiter->task);
}
@@ -331,9 +337,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* it's wounded in __ww_mutex_check_kill() or has a
* wakeup pending to re-read the wounded state.
*/
- if (owner != current)
+ if (owner != current) {
+ /*
+ * When waking up the task to wound, be sure to clear the
+ * blocked_on pointer. Otherwise we can see circular
+ * blocked_on relationships that can't resolve.
+ */
+ __clear_task_blocked_on(owner, lock);
wake_q_add(wake_q, owner);
-
+ }
return true;
}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 8d74b0a21c82..51ddd8866ef3 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -58,6 +58,9 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const u32 __start___kcrctab[];
extern const u32 __start___kcrctab_gpl[];
+#define KMOD_PATH_LEN 256
+extern char modprobe_path[];
+
struct load_info {
const char *name;
/* pointer to module in temporary copy, freed at end of load_module() */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 413ac6ea3702..cdcc50a5353d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -126,9 +126,37 @@ static void mod_update_bounds(struct module *mod)
}
/* Block module loading/unloading? */
-int modules_disabled;
+static int modules_disabled;
core_param(nomodule, modules_disabled, bint, 0);
+static const struct ctl_table module_sysctl_table[] = {
+ {
+ .procname = "modprobe",
+ .data = &modprobe_path,
+ .maxlen = KMOD_PATH_LEN,
+ .mode = 0644,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "modules_disabled",
+ .data = &modules_disabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init init_module_sysctl(void)
+{
+ register_sysctl_init("kernel", module_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_module_sysctl);
+
/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -1573,8 +1601,14 @@ static int apply_relocations(struct module *mod, const struct load_info *info)
if (infosec >= info->hdr->e_shnum)
continue;
- /* Don't bother with non-allocated sections */
- if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
+ /*
+ * Don't bother with non-allocated sections.
+ * An exception is the percpu section, which has separate allocations
+ * for individual CPUs. We relocate the percpu section in the initial
+ * ELF template and subsequently copy it to the per-CPU destinations.
+ */
+ if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) &&
+ (!infosec || infosec != info->index.pcpu))
continue;
if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
@@ -2696,9 +2730,8 @@ static int find_module_sections(struct module *mod, struct load_info *info)
static int move_module(struct module *mod, struct load_info *info)
{
- int i;
- enum mod_mem_type t = 0;
- int ret = -ENOMEM;
+ int i, ret;
+ enum mod_mem_type t = MOD_MEM_NUM_TYPES;
bool codetag_section_found = false;
for_each_mod_mem_type(type) {
@@ -2776,7 +2809,7 @@ static int move_module(struct module *mod, struct load_info *info)
return 0;
out_err:
module_memory_restore_rox(mod);
- for (t--; t >= 0; t--)
+ while (t--)
module_memory_free(mod, t);
if (codetag_section_found)
codetag_free_module_sections(mod);
@@ -3368,7 +3401,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
module_allocated = true;
- audit_log_kern_module(mod->name);
+ audit_log_kern_module(info->name);
/* Reserve our place in the list. */
err = add_unformed_module(mod);
@@ -3532,8 +3565,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
* failures once the proper module was allocated and
* before that.
*/
- if (!module_allocated)
+ if (!module_allocated) {
+ audit_log_kern_module(info->name ? info->name : "?");
mod_stat_bump_becoming(info, flags);
+ }
free_copy(info, flags);
return err;
}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index b401ff4b02d2..c7622ff5226a 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -56,9 +56,9 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
{
const struct bin_attribute *const *bin_attr;
- for (bin_attr = sect_attrs->grp.bin_attrs_new; *bin_attr; bin_attr++)
+ for (bin_attr = sect_attrs->grp.bin_attrs; *bin_attr; bin_attr++)
kfree((*bin_attr)->attr.name);
- kfree(sect_attrs->grp.bin_attrs_new);
+ kfree(sect_attrs->grp.bin_attrs);
kfree(sect_attrs);
}
@@ -86,7 +86,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info)
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
- sect_attrs->grp.bin_attrs_new = gattr;
+ sect_attrs->grp.bin_attrs = gattr;
sattr = &sect_attrs->attrs[0];
for (i = 0; i < info->hdr->e_shnum; i++) {
@@ -101,7 +101,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info)
ret = -ENOMEM;
goto out;
}
- sattr->read_new = module_sect_read;
+ sattr->read = module_sect_read;
sattr->private = (void *)sec->sh_addr;
sattr->size = MODULE_SECT_READ_SIZE;
sattr->attr.mode = 0400;
@@ -144,7 +144,7 @@ struct module_notes_attrs {
static void free_notes_attrs(struct module_notes_attrs *notes_attrs)
{
- kfree(notes_attrs->grp.bin_attrs_new);
+ kfree(notes_attrs->grp.bin_attrs);
kfree(notes_attrs);
}
@@ -178,7 +178,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info)
}
notes_attrs->grp.name = "notes";
- notes_attrs->grp.bin_attrs_new = gattr;
+ notes_attrs->grp.bin_attrs = gattr;
nattr = &notes_attrs->attrs[0];
for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
@@ -190,7 +190,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info)
nattr->attr.mode = 0444;
nattr->size = info->sechdrs[i].sh_size;
nattr->private = (void *)info->sechdrs[i].sh_addr;
- nattr->read_new = sysfs_bin_attr_simple_read;
+ nattr->read = sysfs_bin_attr_simple_read;
*(gattr++) = nattr++;
}
++loaded;
diff --git a/kernel/panic.c b/kernel/panic.c
index b0b9a8bf4560..64e58835086d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -84,6 +84,50 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
#ifdef CONFIG_SYSCTL
+
+/*
+ * Taint values can only be increased
+ * This means we can safely use a temporary.
+ */
+static int proc_taint(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long tmptaint = get_taint();
+ int err;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &tmptaint;
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ if (write) {
+ int i;
+
+ /*
+ * If we are relying on panic_on_taint not producing
+ * false positives due to userspace input, bail out
+ * before setting the requested taint flags.
+ */
+ if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint))
+ return -EINVAL;
+
+ /*
+ * Poor man's atomic or. Not worth adding a primitive
+ * to everyone's atomic.h for this
+ */
+ for (i = 0; i < TAINT_FLAGS_COUNT; i++)
+ if ((1UL << i) & tmptaint)
+ add_taint(i, LOCKDEP_STILL_OK);
+ }
+
+ return err;
+}
+
static const struct ctl_table kern_panic_table[] = {
#ifdef CONFIG_SMP
{
@@ -97,6 +141,12 @@ static const struct ctl_table kern_panic_table[] = {
},
#endif
{
+ .procname = "tainted",
+ .maxlen = sizeof(long),
+ .mode = 0644,
+ .proc_handler = proc_taint,
+ },
+ {
.procname = "panic",
.data = &panic_timeout,
.maxlen = sizeof(int),
@@ -133,6 +183,16 @@ static const struct ctl_table kern_panic_table[] = {
.mode = 0644,
.proc_handler = proc_douintvec,
},
+#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
+ defined(CONFIG_DEBUG_STACKOVERFLOW)
+ {
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
};
static __init int kernel_panic_sysctls_init(void)
diff --git a/kernel/pid.c b/kernel/pid.c
index 8317bcbc7cf7..c45a28c16cd2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
- WARN_ON_ONCE(pid->stashed);
+ pidfs_free_pid(pid);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -713,6 +713,29 @@ static struct ctl_table_root pid_table_root = {
.set_ownership = pid_table_root_set_ownership,
};
+static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct pid *new_pid;
+ pid_t tmp_pid;
+ int r;
+ struct ctl_table tmp_table = *table;
+
+ tmp_pid = pid_vnr(cad_pid);
+ tmp_table.data = &tmp_pid;
+
+ r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+ if (r || !write)
+ return r;
+
+ new_pid = find_get_pid(tmp_pid);
+ if (!new_pid)
+ return -ESRCH;
+
+ put_pid(xchg(&cad_pid, new_pid));
+ return 0;
+}
+
static const struct ctl_table pid_table[] = {
{
.procname = "pid_max",
@@ -723,6 +746,14 @@ static const struct ctl_table pid_table[] = {
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
+#ifdef CONFIG_PROC_SYSCTL
+ {
+ .procname = "cad_pid",
+ .maxlen = sizeof(int),
+ .mode = 0600,
+ .proc_handler = proc_do_cad_pid,
+ },
+#endif
};
#endif
diff --git a/kernel/power/console.c b/kernel/power/console.c
index fcdf0e14a47d..19c48aa5355d 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -16,6 +16,7 @@
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
+static bool vt_switch_done;
static DEFINE_MUTEX(vt_switch_mutex);
@@ -136,17 +137,21 @@ void pm_prepare_console(void)
if (orig_fgconsole < 0)
return;
+ vt_switch_done = true;
+
orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
return;
}
void pm_restore_console(void)
{
- if (!pm_vt_switch())
+ if (!pm_vt_switch() && !vt_switch_done)
return;
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
}
+
+ vt_switch_done = false;
}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 519fb09de5e0..9216e3b91d3b 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -423,7 +423,6 @@ int hibernation_snapshot(int platform_mode)
}
console_suspend_all();
- pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -559,7 +558,6 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
console_suspend_all();
- pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
@@ -571,7 +569,6 @@ int hibernation_restore(int platform_mode)
BUG_ON(!error);
}
dpm_resume_end(PMSG_RECOVER);
- pm_restore_gfp_mask();
console_resume_all();
pm_restore_console();
return error;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3d484630505a..3cf2d7e72567 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
#include <linux/acpi.h>
#include <linux/export.h>
+#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/pm-trace.h>
@@ -112,6 +113,14 @@ int pm_notifier_call_chain(unsigned long val)
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
+static int __init pm_async_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ pm_async_enabled = 0;
+ return 1;
+}
+__setup("pm_async=", pm_async_setup);
+
static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index cb1d71562002..7ccd709af93f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -239,11 +239,6 @@ static inline void suspend_test_finish(const char *label) {}
/* kernel/power/main.c */
extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down);
extern int pm_notifier_call_chain(unsigned long val);
-void pm_restrict_gfp_mask(void);
-void pm_restore_gfp_mask(void);
-#else
-static inline void pm_restrict_gfp_mask(void) {}
-static inline void pm_restore_gfp_mask(void) {}
#endif
#ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2af36cfe35cd..501df0676a61 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1536,7 +1536,7 @@ static unsigned long copy_data_pages(struct memory_bitmap *copy_bm,
memory_bm_position_reset(orig_bm);
memory_bm_position_reset(copy_bm);
copy_pfn = memory_bm_next_pfn(copy_bm);
- for(;;) {
+ for (;;) {
pfn = memory_bm_next_pfn(orig_bm);
if (unlikely(pfn == BM_END_OF_MAP))
break;
@@ -2161,13 +2161,13 @@ static const char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
- if (strcmp(info->uts.sysname,init_utsname()->sysname))
+ if (strcmp(info->uts.sysname, init_utsname()->sysname))
return "system type";
- if (strcmp(info->uts.release,init_utsname()->release))
+ if (strcmp(info->uts.release, init_utsname()->release))
return "kernel release";
- if (strcmp(info->uts.version,init_utsname()->version))
+ if (strcmp(info->uts.version, init_utsname()->version))
return "version";
- if (strcmp(info->uts.machine,init_utsname()->machine))
+ if (strcmp(info->uts.machine, init_utsname()->machine))
return "machine";
return NULL;
}
@@ -2361,7 +2361,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm,
struct memory_bitmap *zero_bm)
{
unsigned long decoded_pfn;
- bool zero;
+ bool zero;
int j;
for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 76b141b9aac0..b4ca17c2fecf 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -384,6 +384,7 @@ static int suspend_prepare(suspend_state_t state)
return 0;
dpm_save_failed_step(SUSPEND_FREEZE);
+ filesystems_thaw();
pm_notifier_call_chain(PM_POST_SUSPEND);
Restore:
pm_restore_console();
@@ -592,8 +593,6 @@ static int enter_state(suspend_state_t state)
ksys_sync_helper();
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
}
- if (filesystem_freeze_enabled)
- filesystems_freeze();
pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
pm_suspend_clear_flags();
@@ -606,16 +605,13 @@ static int enter_state(suspend_state_t state)
trace_suspend_resume(TPS("suspend_enter"), state, false);
pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
- pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
- pm_restore_gfp_mask();
Finish:
events_check_enabled = false;
pm_pr_dbg("Finishing wakeup.\n");
suspend_finish();
Unlock:
- filesystems_thaw();
mutex_unlock(&system_transition_mutex);
return error;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e8a4b720d7d2..14d4499c6fc3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3072,6 +3072,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
+ /* Avoid NULL dereference if callback is NULL. */
+ if (WARN_ON_ONCE(!func))
+ return;
+
if (debug_rcu_head_queue(head)) {
/*
* Probable double call_rcu(), so leak the callback.
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 486c00536207..69482c2f0771 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -17,8 +17,37 @@
// Controlling CPU stall warnings, including delay calculation.
/* panic() on RCU Stall sysctl. */
-int sysctl_panic_on_rcu_stall __read_mostly;
-int sysctl_max_rcu_stall_to_panic __read_mostly;
+static int sysctl_panic_on_rcu_stall __read_mostly;
+static int sysctl_max_rcu_stall_to_panic __read_mostly;
+
+static const struct ctl_table rcu_stall_sysctl_table[] = {
+ {
+ .procname = "panic_on_rcu_stall",
+ .data = &sysctl_panic_on_rcu_stall,
+ .maxlen = sizeof(sysctl_panic_on_rcu_stall),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "max_rcu_stall_to_panic",
+ .data = &sysctl_max_rcu_stall_to_panic,
+ .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+};
+
+static int __init init_rcu_stall_sysctl(void)
+{
+ register_sysctl_init("kernel", rcu_stall_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_rcu_stall_sysctl);
#ifdef CONFIG_SYSFS
diff --git a/kernel/resource.c b/kernel/resource.c
index 8d3e6ed0bdc1..f9bb5481501a 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1279,8 +1279,9 @@ static int __request_region_locked(struct resource *res, struct resource *parent
* become unavailable to other users. Conflicts are
* not expected. Warn to aid debugging if encountered.
*/
- if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
- pr_warn("Unaddressable device %s %pR conflicts with %pR",
+ if (parent == &iomem_resource &&
+ conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
+ pr_warn("Unaddressable device %s %pR conflicts with %pR\n",
conflict->name, conflict, res);
}
if (conflict != parent) {
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2b331822c7e7..cdea931aae30 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -4,6 +4,9 @@
* Auto-group scheduling implementation:
*/
+#include "autogroup.h"
+#include "sched.h"
+
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
@@ -25,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void)
{
register_sysctl_init("kernel", sched_autogroup_sysctls);
}
-#else
+#else /* !CONFIG_SYSCTL: */
#define sched_autogroup_sysctl_init() do { } while (0)
-#endif
+#endif /* !CONFIG_SYSCTL */
void __init autogroup_init(struct task_struct *init_task)
{
@@ -108,7 +111,7 @@ static inline struct autogroup *autogroup_create(void)
free_rt_sched_group(tg);
tg->rt_se = root_task_group.rt_se;
tg->rt_rq = root_task_group.rt_rq;
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
tg->autogroup = ag;
sched_online_group(tg, &root_task_group);
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 90d69f2c5eaf..06c82b2bdfb5 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -2,6 +2,8 @@
#ifndef _KERNEL_SCHED_AUTOGROUP_H
#define _KERNEL_SCHED_AUTOGROUP_H
+#include "sched.h"
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -41,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
-#else /* !CONFIG_SCHED_AUTOGROUP */
+#else /* !CONFIG_SCHED_AUTOGROUP: */
static inline void autogroup_init(struct task_struct *init_task) { }
static inline void autogroup_free(struct task_group *tg) { }
@@ -61,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
return 0;
}
-#endif /* CONFIG_SCHED_AUTOGROUP */
+#endif /* !CONFIG_SCHED_AUTOGROUP */
#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 72d97aa8b726..c4a488e67aa7 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -50,11 +50,9 @@
#include "idle.c"
#include "rt.c"
+#include "cpudeadline.c"
-#ifdef CONFIG_SMP
-# include "cpudeadline.c"
-# include "pelt.c"
-#endif
+#include "pelt.c"
#include "cputime.c"
#include "deadline.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index bf9d8db94b70..e2cf3b08d4e9 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -80,11 +80,10 @@
#include "wait_bit.c"
#include "wait.c"
-#ifdef CONFIG_SMP
-# include "cpupri.c"
-# include "stop_task.c"
-# include "topology.c"
-#endif
+#include "cpupri.c"
+#include "stop_task.c"
+
+#include "topology.c"
#ifdef CONFIG_SCHED_CORE
# include "core_sched.c"
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index a09655b48140..f5e6dd6a6b3a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -54,6 +54,9 @@
*
*/
+#include <linux/sched/clock.h>
+#include "sched.h"
+
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
@@ -471,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void)
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */
void __init sched_clock_init(void)
{
@@ -489,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu)
return sched_clock();
}
-#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
/*
* Running clock - returns the time that has elapsed while a guest has been
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 3561ab533dd4..19ee702273c0 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -13,6 +13,11 @@
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
+#include <linux/linkage.h>
+#include <linux/sched/debug.h>
+#include <linux/completion.h>
+#include "sched.h"
+
static void complete_with_flags(struct completion *x, int wake_flags)
{
unsigned long flags;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dce50fa57471..3ec00d08d46a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -69,8 +69,8 @@
#include <linux/livepatch_sched.h>
#ifdef CONFIG_PREEMPT_DYNAMIC
-# ifdef CONFIG_GENERIC_ENTRY
-# include <linux/entry-common.h>
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
# endif
#endif
@@ -96,6 +96,7 @@
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+#include "../locking/mutex.h"
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
@@ -119,6 +120,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SCHED_PROXY_EXEC
+DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
+static int __init setup_proxy_exec(char *str)
+{
+ bool proxy_enable = true;
+
+ if (*str && kstrtobool(str + 1, &proxy_enable)) {
+ pr_warn("Unable to parse sched_proxy_exec=\n");
+ return 0;
+ }
+
+ if (proxy_enable) {
+ pr_info("sched_proxy_exec enabled via boot arg\n");
+ static_branch_enable(&__sched_proxy_exec);
+ } else {
+ pr_info("sched_proxy_exec disabled via boot arg\n");
+ static_branch_disable(&__sched_proxy_exec);
+ }
+ return 1;
+}
+#else
+static int __init setup_proxy_exec(char *str)
+{
+ pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n");
+ return 0;
+}
+#endif
+__setup("sched_proxy_exec", setup_proxy_exec);
+
/*
* Debugging: various feature bits
*
@@ -481,13 +511,13 @@ void sched_core_put(void)
schedule_work(&_work);
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/* need a wrapper since we may need to trace from modules */
EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
@@ -650,7 +680,6 @@ void raw_spin_rq_unlock(struct rq *rq)
raw_spin_unlock(rq_lockp(rq));
}
-#ifdef CONFIG_SMP
/*
* double_rq_lock - safely lock two runqueues
*/
@@ -667,7 +696,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
double_rq_clock_clear_update(rq1, rq2);
}
-#endif
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -853,8 +881,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-#ifdef CONFIG_SMP
-
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
@@ -899,33 +925,12 @@ void hrtick_start(struct rq *rq, u64 delay)
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
-#else
-/*
- * Called to set the hrtick timer state.
- *
- * called with rq->lock held and IRQs disabled
- */
-void hrtick_start(struct rq *rq, u64 delay)
-{
- /*
- * Don't schedule slices shorter than 10000ns, that just
- * doesn't make sense. Rely on vruntime for fairness.
- */
- delay = max_t(u64, delay, 10000LL);
- hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED_HARD);
-}
-
-#endif /* CONFIG_SMP */
-
static void hrtick_rq_init(struct rq *rq)
{
-#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
-#endif
hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
-#else /* CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void hrtick_clear(struct rq *rq)
{
}
@@ -933,7 +938,7 @@ static inline void hrtick_clear(struct rq *rq)
static inline void hrtick_rq_init(struct rq *rq)
{
}
-#endif /* CONFIG_SCHED_HRTICK */
+#endif /* !CONFIG_SCHED_HRTICK */
/*
* try_cmpxchg based fetch_or() macro so it works for different integer types:
@@ -949,7 +954,7 @@ static inline void hrtick_rq_init(struct rq *rq)
_val; \
})
-#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+#ifdef TIF_POLLING_NRFLAG
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
@@ -988,13 +993,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
return true;
}
-#ifdef CONFIG_SMP
static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
-#endif
static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
@@ -1167,7 +1170,6 @@ void resched_cpu(int cpu)
raw_spin_rq_unlock_irqrestore(rq, flags);
}
-#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy CPU for migrating timers
@@ -1374,10 +1376,8 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
-#endif /* CONFIG_SMP */
-#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
- (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED)
/*
* Iterate task_group tree rooted at *from, calling @down when first entering a
* node and @up when leaving it for the final time.
@@ -1971,7 +1971,7 @@ undo:
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
return result;
}
-#endif
+#endif /* CONFIG_SYSCTL */
static void uclamp_fork(struct task_struct *p)
{
@@ -2037,13 +2037,13 @@ static void __init init_uclamp(void)
}
}
-#else /* !CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK: */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
-#endif /* CONFIG_UCLAMP_TASK */
+#endif /* !CONFIG_UCLAMP_TASK */
bool sched_task_on_rq(struct task_struct *p)
{
@@ -2353,8 +2353,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
return ncsw;
}
-#ifdef CONFIG_SMP
-
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
@@ -2936,8 +2934,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
struct set_affinity_pending my_pending = { }, *pending = NULL;
bool stop_pending, complete = false;
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+ /*
+ * Can the task run on the task's current CPU? If so, we're done
+ *
+ * We are also done if the task is the current donor, boosting a lock-
+ * holding proxy, (and potentially has been migrated outside its
+ * current or previous affinity mask)
+ */
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) ||
+ (task_current_donor(rq, p) && !task_current(rq, p))) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
@@ -3305,6 +3310,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
WARN_ON_ONCE(ret);
}
+#ifdef CONFIG_SMP
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
unsigned int state = READ_ONCE(p->__state);
@@ -3358,14 +3365,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
__set_task_cpu(p, new_cpu);
}
+#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
- __schedstat_inc(p->stats.numa_task_swapped);
- count_vm_numa_event(NUMA_TASK_SWAP);
- count_memcg_event_mm(p->mm, NUMA_TASK_SWAP);
-
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
@@ -3661,17 +3665,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
}
}
-#else /* CONFIG_SMP */
-
-static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
-
-static inline bool rq_has_pinned_tasks(struct rq *rq)
-{
- return false;
-}
-
-#endif /* !CONFIG_SMP */
-
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -3682,7 +3675,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
rq = this_rq();
-#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
__schedstat_inc(p->stats.nr_wakeups_local);
@@ -3702,7 +3694,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
if (wake_flags & WF_MIGRATED)
__schedstat_inc(p->stats.nr_wakeups_migrate);
-#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
__schedstat_inc(p->stats.nr_wakeups);
@@ -3731,13 +3722,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
-#ifdef CONFIG_SMP
if (wake_flags & WF_RQ_SELECTED)
en_flags |= ENQUEUE_RQ_SELECTED;
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
-#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
@@ -3748,7 +3737,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
ttwu_do_wakeup(p);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Our task @p is fully woken up and running; so it's safe to
@@ -3770,7 +3758,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
-#endif
}
/*
@@ -3824,7 +3811,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
return ret;
}
-#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
@@ -3891,7 +3877,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
+#ifdef CONFIG_SMP
__smp_call_single_queue(cpu, &p->wake_entry.llist);
+#endif
}
void wake_up_if_idle(int cpu)
@@ -3943,6 +3931,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
if (!scx_allow_ttwu_queue(p))
return false;
+#ifdef CONFIG_SMP
+ if (p->sched_class == &stop_sched_class)
+ return false;
+#endif
+
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
@@ -3992,15 +3985,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
return false;
}
-#else /* !CONFIG_SMP */
-
-static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
-{
- return false;
-}
-
-#endif /* CONFIG_SMP */
-
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
@@ -4256,7 +4240,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
break;
-#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
@@ -4335,9 +4318,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
-#else
- cpu = task_cpu(p);
-#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
}
@@ -4370,14 +4350,12 @@ static bool __task_needs_rq_lock(struct task_struct *p)
if (p->on_rq)
return true;
-#ifdef CONFIG_SMP
/*
* Ensure the task has finished __schedule() and will not be referenced
* anymore. Again, see try_to_wake_up() for a longer comment.
*/
smp_rmb();
smp_cond_load_acquire(&p->on_cpu, !VAL);
-#endif
return false;
}
@@ -4533,10 +4511,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->capture_control = NULL;
#endif
init_numa_balancing(clone_flags, p);
-#ifdef CONFIG_SMP
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
-#endif
init_sched_mm_cid(p);
}
@@ -4599,8 +4575,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write,
}
return err;
}
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SCHEDSTATS
@@ -4787,14 +4763,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP)
p->on_cpu = 0;
-#endif
init_task_preempt_count(p);
-#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
RB_CLEAR_NODE(&p->pushable_dl_tasks);
-#endif
+
return 0;
}
@@ -4871,7 +4844,6 @@ void wake_up_new_task(struct task_struct *p)
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
-#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_ptr can change in the fork path
@@ -4883,7 +4855,6 @@ void wake_up_new_task(struct task_struct *p)
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
-#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
@@ -4891,7 +4862,6 @@ void wake_up_new_task(struct task_struct *p)
activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL);
trace_sched_wakeup_new(p);
wakeup_preempt(rq, p, wake_flags);
-#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so it's fine to
@@ -4901,7 +4871,6 @@ void wake_up_new_task(struct task_struct *p)
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
-#endif
task_rq_unlock(rq, p, &rf);
}
@@ -4978,7 +4947,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
__fire_sched_out_preempt_notifiers(curr, next);
}
-#else /* !CONFIG_PREEMPT_NOTIFIERS */
+#else /* !CONFIG_PREEMPT_NOTIFIERS: */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
@@ -4990,11 +4959,10 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
{
}
-#endif /* CONFIG_PREEMPT_NOTIFIERS */
+#endif /* !CONFIG_PREEMPT_NOTIFIERS */
static inline void prepare_task(struct task_struct *next)
{
-#ifdef CONFIG_SMP
/*
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
@@ -5003,12 +4971,10 @@ static inline void prepare_task(struct task_struct *next)
* its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
-#endif
}
static inline void finish_task(struct task_struct *prev)
{
-#ifdef CONFIG_SMP
/*
* This must be the very last reference to @prev from this CPU. After
* p->on_cpu is cleared, the task can be moved to a different CPU. We
@@ -5021,11 +4987,8 @@ static inline void finish_task(struct task_struct *prev)
* Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
-#endif
}
-#ifdef CONFIG_SMP
-
static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
{
void (*func)(struct rq *rq);
@@ -5107,14 +5070,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head)
}
}
-#else
-
-static inline void __balance_callbacks(struct rq *rq)
-{
-}
-
-#endif
-
static inline void
prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
{
@@ -5502,8 +5457,6 @@ unsigned int nr_iowait(void)
return sum;
}
-#ifdef CONFIG_SMP
-
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
* this point the task has the smallest effective memory and cache footprint.
@@ -5527,8 +5480,6 @@ void sched_exec(void)
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}
-#endif
-
DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
@@ -5563,7 +5514,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
struct rq *rq;
u64 ns;
-#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+#ifdef CONFIG_64BIT
/*
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0.
@@ -5690,12 +5641,10 @@ void sched_tick(void)
if (donor->flags & PF_WQ_WORKER)
wq_worker_tick(donor);
-#ifdef CONFIG_SMP
if (!scx_switched_all()) {
rq->idle_balance = idle_cpu(cpu);
sched_balance_trigger(rq);
}
-#endif
}
#ifdef CONFIG_NO_HZ_FULL
@@ -5835,10 +5784,10 @@ int __init sched_tick_offload_init(void)
return 0;
}
-#else /* !CONFIG_NO_HZ_FULL */
+#else /* !CONFIG_NO_HZ_FULL: */
static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
-#endif
+#endif /* !CONFIG_NO_HZ_FULL */
#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
@@ -6553,7 +6502,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu)
rq->core = rq;
}
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
@@ -6565,7 +6514,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return __pick_next_task(rq, prev, rf);
}
-#endif /* CONFIG_SCHED_CORE */
+#endif /* !CONFIG_SCHED_CORE */
/*
* Constants for the sched_mode argument of __schedule().
@@ -6581,11 +6530,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/*
* Helper function for __schedule()
*
- * If a task does not have signals pending, deactivate it
- * Otherwise marks the task's __state as RUNNING
+ * Tries to deactivate the task, unless the should_block arg
+ * is false or if a signal is pending. In the case a signal
+ * is pending, marks the task's __state as RUNNING (and clear
+ * blocked_on).
*/
static bool try_to_block_task(struct rq *rq, struct task_struct *p,
- unsigned long *task_state_p)
+ unsigned long *task_state_p, bool should_block)
{
unsigned long task_state = *task_state_p;
int flags = DEQUEUE_NOCLOCK;
@@ -6596,6 +6547,16 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return false;
}
+ /*
+ * We check should_block after signal_pending because we
+ * will want to wake the task in that case. But if
+ * should_block is false, its likely due to the task being
+ * blocked on a mutex, and we want to keep it on the runqueue
+ * to be selectable for proxy-execution.
+ */
+ if (!should_block)
+ return false;
+
p->sched_contributes_to_load =
(task_state & TASK_UNINTERRUPTIBLE) &&
!(task_state & TASK_NOLOAD) &&
@@ -6619,6 +6580,194 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
return true;
}
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline struct task_struct *proxy_resched_idle(struct rq *rq)
+{
+ put_prev_set_next_task(rq, rq->donor, rq->idle);
+ rq_set_donor(rq, rq->idle);
+ set_tsk_need_resched(rq->idle);
+ return rq->idle;
+}
+
+static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ unsigned long state = READ_ONCE(donor->__state);
+
+ /* Don't deactivate if the state has been changed to TASK_RUNNING */
+ if (state == TASK_RUNNING)
+ return false;
+ /*
+ * Because we got donor from pick_next_task(), it is *crucial*
+ * that we call proxy_resched_idle() before we deactivate it.
+ * As once we deactivate donor, donor->on_rq is set to zero,
+ * which allows ttwu() to immediately try to wake the task on
+ * another rq. So we cannot use *any* references to donor
+ * after that point. So things like cfs_rq->curr or rq->donor
+ * need to be changed from next *before* we deactivate.
+ */
+ proxy_resched_idle(rq);
+ return try_to_block_task(rq, donor, &state, true);
+}
+
+static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+{
+ if (!__proxy_deactivate(rq, donor)) {
+ /*
+ * XXX: For now, if deactivation failed, set donor
+ * as unblocked, as we aren't doing proxy-migrations
+ * yet (more logic will be needed then).
+ */
+ donor->blocked_on = NULL;
+ }
+ return NULL;
+}
+
+/*
+ * Find runnable lock owner to proxy for mutex blocked donor
+ *
+ * Follow the blocked-on relation:
+ * task->blocked_on -> mutex->owner -> task...
+ *
+ * Lock order:
+ *
+ * p->pi_lock
+ * rq->lock
+ * mutex->wait_lock
+ *
+ * Returns the task that is going to be used as execution context (the one
+ * that is actually going to be run on cpu_of(rq)).
+ */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ struct task_struct *owner = NULL;
+ int this_cpu = cpu_of(rq);
+ struct task_struct *p;
+ struct mutex *mutex;
+
+ /* Follow blocked_on chain. */
+ for (p = donor; task_is_blocked(p); p = owner) {
+ mutex = p->blocked_on;
+ /* Something changed in the chain, so pick again */
+ if (!mutex)
+ return NULL;
+ /*
+ * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
+ * and ensure @owner sticks around.
+ */
+ guard(raw_spinlock)(&mutex->wait_lock);
+
+ /* Check again that p is blocked with wait_lock held */
+ if (mutex != __get_task_blocked_on(p)) {
+ /*
+ * Something changed in the blocked_on chain and
+ * we don't know if only at this level. So, let's
+ * just bail out completely and let __schedule()
+ * figure things out (pick_again loop).
+ */
+ return NULL;
+ }
+
+ owner = __mutex_owner(mutex);
+ if (!owner) {
+ __clear_task_blocked_on(p, mutex);
+ return p;
+ }
+
+ if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
+ /* XXX Don't handle blocked owners/delayed dequeue yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_cpu(owner) != this_cpu) {
+ /* XXX Don't handle migrations yet */
+ return proxy_deactivate(rq, donor);
+ }
+
+ if (task_on_rq_migrating(owner)) {
+ /*
+ * One of the chain of mutex owners is currently migrating to this
+ * CPU, but has not yet been enqueued because we are holding the
+ * rq lock. As a simple solution, just schedule rq->idle to give
+ * the migration a chance to complete. Much like the migrate_task
+ * case we should end up back in find_proxy_task(), this time
+ * hopefully with all relevant tasks already enqueued.
+ */
+ return proxy_resched_idle(rq);
+ }
+
+ /*
+ * Its possible to race where after we check owner->on_rq
+ * but before we check (owner_cpu != this_cpu) that the
+ * task on another cpu was migrated back to this cpu. In
+ * that case it could slip by our checks. So double check
+ * we are still on this cpu and not migrating. If we get
+ * inconsistent results, try again.
+ */
+ if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu)
+ return NULL;
+
+ if (owner == p) {
+ /*
+ * It's possible we interleave with mutex_unlock like:
+ *
+ * lock(&rq->lock);
+ * find_proxy_task()
+ * mutex_unlock()
+ * lock(&wait_lock);
+ * donor(owner) = current->blocked_donor;
+ * unlock(&wait_lock);
+ *
+ * wake_up_q();
+ * ...
+ * ttwu_runnable()
+ * __task_rq_lock()
+ * lock(&wait_lock);
+ * owner == p
+ *
+ * Which leaves us to finish the ttwu_runnable() and make it go.
+ *
+ * So schedule rq->idle so that ttwu_runnable() can get the rq
+ * lock and mark owner as running.
+ */
+ return proxy_resched_idle(rq);
+ }
+ /*
+ * OK, now we're absolutely sure @owner is on this
+ * rq, therefore holding @rq->lock is sufficient to
+ * guarantee its existence, as per ttwu_remote().
+ */
+ }
+
+ WARN_ON_ONCE(owner && !owner->on_rq);
+ return owner;
+}
+#else /* SCHED_PROXY_EXEC */
+static struct task_struct *
+find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+{
+ WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n");
+ return donor;
+}
+#endif /* SCHED_PROXY_EXEC */
+
+static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
+{
+ if (!sched_proxy_exec())
+ return;
+ /*
+ * pick_next_task() calls set_next_task() on the chosen task
+ * at some point, which ensures it is not push/pullable.
+ * However, the chosen/donor task *and* the mutex owner form an
+ * atomic pair wrt push/pull.
+ *
+ * Make sure owner we run is not pushable. Unfortunately we can
+ * only deal with that by means of a dequeue/enqueue cycle. :-/
+ */
+ dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
+ enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
+}
+
/*
* __schedule() is the main scheduler function.
*
@@ -6731,15 +6880,31 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- try_to_block_task(rq, prev, &prev_state);
+ /*
+ * We pass task_is_blocked() as the should_block arg
+ * in order to keep mutex-blocked tasks on the runqueue
+ * for slection with proxy-exec (without proxy-exec
+ * task_is_blocked() will always be false).
+ */
+ try_to_block_task(rq, prev, &prev_state,
+ !task_is_blocked(prev));
switch_count = &prev->nvcsw;
}
- next = pick_next_task(rq, prev, &rf);
+pick_again:
+ next = pick_next_task(rq, rq->donor, &rf);
rq_set_donor(rq, next);
+ if (unlikely(task_is_blocked(next))) {
+ next = find_proxy_task(rq, next, &rf);
+ if (!next)
+ goto pick_again;
+ if (next == rq->idle)
+ goto keep_resched;
+ }
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
+keep_resched:
rq->last_seen_need_resched_ns = 0;
is_switch = prev != next;
@@ -6750,6 +6915,10 @@ picked:
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
+
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
@@ -6784,6 +6953,10 @@ picked:
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ /* In case next was already curr but just got blocked_donor */
+ if (!task_current_donor(rq, next))
+ proxy_tag_curr(rq, next);
+
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
@@ -6992,14 +7165,14 @@ NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_dynamic_enabled
-#define preempt_schedule_dynamic_enabled preempt_schedule
-#define preempt_schedule_dynamic_disabled NULL
-#endif
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# ifndef preempt_schedule_dynamic_enabled
+# define preempt_schedule_dynamic_enabled preempt_schedule
+# define preempt_schedule_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
void __sched notrace dynamic_preempt_schedule(void)
{
@@ -7009,8 +7182,8 @@ void __sched notrace dynamic_preempt_schedule(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule);
EXPORT_SYMBOL(dynamic_preempt_schedule);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -7065,14 +7238,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#ifndef preempt_schedule_notrace_dynamic_enabled
-#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
-#define preempt_schedule_notrace_dynamic_disabled NULL
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# ifndef preempt_schedule_notrace_dynamic_enabled
+# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+# define preempt_schedule_notrace_dynamic_disabled NULL
+# endif
DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
void __sched notrace dynamic_preempt_schedule_notrace(void)
{
@@ -7082,7 +7255,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void)
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
-#endif
+# endif
#endif
#endif /* CONFIG_PREEMPTION */
@@ -7301,7 +7474,7 @@ out_unlock:
preempt_enable();
}
-#endif
+#endif /* CONFIG_RT_MUTEXES */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
@@ -7332,17 +7505,17 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define cond_resched_dynamic_enabled __cond_resched
-#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
+# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+# define cond_resched_dynamic_enabled __cond_resched
+# define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
-#define might_resched_dynamic_enabled __cond_resched
-#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
+# define might_resched_dynamic_enabled __cond_resched
+# define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
@@ -7360,8 +7533,8 @@ int __sched dynamic_might_resched(void)
return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
-#endif
-#endif
+# endif
+#endif /* CONFIG_PREEMPT_DYNAMIC */
/*
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -7427,9 +7600,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
#ifdef CONFIG_PREEMPT_DYNAMIC
-#ifdef CONFIG_GENERIC_ENTRY
-#include <linux/entry-common.h>
-#endif
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+# include <linux/irq-entry-common.h>
+# endif
/*
* SC:cond_resched
@@ -7484,37 +7657,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
-#ifndef CONFIG_PREEMPT_RT
+# ifndef CONFIG_PREEMPT_RT
if (!strcmp(str, "none"))
return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
return preempt_dynamic_voluntary;
-#endif
+# endif
if (!strcmp(str, "full"))
return preempt_dynamic_full;
-#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
if (!strcmp(str, "lazy"))
return preempt_dynamic_lazy;
-#endif
+# endif
return -EINVAL;
}
-#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
-#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
-#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
-#else
-#error "Unsupported PREEMPT_DYNAMIC mechanism"
-#endif
+# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+# define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
+# else
+# error "Unsupported PREEMPT_DYNAMIC mechanism"
+# endif
static DEFINE_MUTEX(sched_dynamic_mutex);
@@ -7618,7 +7791,7 @@ static void __init preempt_dynamic_init(void)
}
}
-#define PREEMPT_MODEL_ACCESSOR(mode) \
+# define PREEMPT_MODEL_ACCESSOR(mode) \
bool preempt_model_##mode(void) \
{ \
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
@@ -7663,7 +7836,7 @@ const char *preempt_model_str(void)
if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
seq_buf_printf(&s, "(%s)%s",
- preempt_dynamic_mode > 0 ?
+ preempt_dynamic_mode >= 0 ?
preempt_modes[preempt_dynamic_mode] : "undef",
brace ? "}" : "");
return seq_buf_str(&s);
@@ -7819,12 +7992,10 @@ void show_state_filter(unsigned int state_filter)
*/
void __init init_idle(struct task_struct *idle, int cpu)
{
-#ifdef CONFIG_SMP
struct affinity_context ac = (struct affinity_context) {
.new_mask = cpumask_of(cpu),
.flags = 0,
};
-#endif
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -7840,13 +8011,11 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY;
kthread_set_per_cpu(idle, cpu);
-#ifdef CONFIG_SMP
/*
* No validation and serialization required at boot time and for
* setting up the idle tasks of not yet online CPUs.
*/
set_cpus_allowed_common(idle, &ac);
-#endif
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the CPU isn't yet set to this CPU so the
@@ -7865,9 +8034,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
rq_set_donor(rq, idle);
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
-#ifdef CONFIG_SMP
idle->on_cpu = 1;
-#endif
raw_spin_rq_unlock(rq);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
@@ -7880,13 +8047,9 @@ void __init init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
}
-#ifdef CONFIG_SMP
-
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -7934,9 +8097,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
- __schedstat_inc(p->stats.numa_task_migrated);
- count_vm_numa_event(NUMA_TASK_MIGRATE);
- count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE);
+ /* TODO: This is not properly updating schedstats */
+
trace_sched_move_numa(p, curr_cpu, target_cpu);
return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
}
@@ -8123,7 +8285,7 @@ static void balance_hotplug_wait(void)
TASK_UNINTERRUPTIBLE);
}
-#else
+#else /* !CONFIG_HOTPLUG_CPU: */
static inline void balance_push(struct rq *rq)
{
@@ -8137,7 +8299,7 @@ static inline void balance_hotplug_wait(void)
{
}
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif /* !CONFIG_HOTPLUG_CPU */
void set_rq_online(struct rq *rq)
{
@@ -8446,7 +8608,7 @@ int sched_cpu_dying(unsigned int cpu)
sched_core_cpu_dying(cpu);
return 0;
}
-#endif
+#endif /* CONFIG_HOTPLUG_CPU */
void __init sched_init_smp(void)
{
@@ -8470,6 +8632,8 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+ sched_init_dl_servers();
+
sched_smp_initialized = true;
}
@@ -8480,13 +8644,6 @@ static int __init migration_init(void)
}
early_initcall(migration_init);
-#else
-void __init sched_init_smp(void)
-{
- sched_init_granularity();
-}
-#endif /* CONFIG_SMP */
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -8512,9 +8669,7 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
-#ifdef CONFIG_SMP
BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
-#endif
BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
@@ -8545,7 +8700,7 @@ void __init sched_init(void)
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_EXT_GROUP_SCHED
- root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+ scx_tg_init(&root_task_group);
#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -8557,9 +8712,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
}
-#ifdef CONFIG_SMP
init_defrootdomain();
-#endif
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
@@ -8620,7 +8773,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = global_rt_runtime();
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = SCHED_CAPACITY_SCALE;
@@ -8646,7 +8798,6 @@ void __init sched_init(void)
#ifdef CONFIG_HOTPLUG_CPU
rcuwait_init(&rq->hotplug_wait);
#endif
-#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
fair_server_init(rq);
@@ -8694,10 +8845,9 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
-#ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+
balance_push_set(smp_processor_id(), false);
-#endif
init_sched_fair_class();
init_sched_ext_class();
@@ -8830,7 +8980,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset)
}
EXPORT_SYMBOL_GPL(__cant_sleep);
-#ifdef CONFIG_SMP
+# ifdef CONFIG_SMP
void __cant_migrate(const char *file, int line)
{
static unsigned long prev_jiffy;
@@ -8861,8 +9011,8 @@ void __cant_migrate(const char *file, int line)
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
EXPORT_SYMBOL_GPL(__cant_migrate);
-#endif
-#endif
+# endif /* CONFIG_SMP */
+#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
@@ -8902,7 +9052,7 @@ void normalize_rt_tasks(void)
#endif /* CONFIG_MAGIC_SYSRQ */
-#if defined(CONFIG_KGDB_KDB)
+#ifdef CONFIG_KGDB_KDB
/*
* These functions are only useful for KDB.
*
@@ -8926,7 +9076,7 @@ struct task_struct *curr_task(int cpu)
return cpu_curr(cpu);
}
-#endif /* defined(CONFIG_KGDB_KDB) */
+#endif /* CONFIG_KGDB_KDB */
#ifdef CONFIG_CGROUP_SCHED
/* task_group_lock serializes the addition/removal of task groups */
@@ -8985,7 +9135,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
- scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+ scx_tg_init(tg);
alloc_uclamp_sched_group(tg, parent);
return tg;
@@ -9422,47 +9572,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
-const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
-/* More than 203 days if BW_SHIFT equals 20. */
-static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
-
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
- u64 burst)
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
{
int i, ret = 0, runtime_enabled, runtime_was_enabled;
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 period, quota, burst;
- if (tg == &root_task_group)
- return -EINVAL;
-
- /*
- * Ensure we have at some amount of bandwidth every period. This is
- * to prevent reaching a state of large arrears when throttled via
- * entity_tick() resulting in prolonged exit starvation.
- */
- if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
- return -EINVAL;
-
- /*
- * Likewise, bound things on the other side by preventing insane quota
- * periods. This also allows us to normalize in computing quota
- * feasibility.
- */
- if (period > max_cfs_quota_period)
- return -EINVAL;
+ period = (u64)period_us * NSEC_PER_USEC;
- /*
- * Bound quota to defend quota against overflow during bandwidth shift.
- */
- if (quota != RUNTIME_INF && quota > max_cfs_runtime)
- return -EINVAL;
+ if (quota_us == RUNTIME_INF)
+ quota = RUNTIME_INF;
+ else
+ quota = (u64)quota_us * NSEC_PER_USEC;
- if (quota != RUNTIME_INF && (burst > quota ||
- burst + quota > max_cfs_runtime))
- return -EINVAL;
+ burst = (u64)burst_us * NSEC_PER_USEC;
/*
* Prevent race between setting of cfs_rq->runtime_enabled and
@@ -9517,28 +9643,22 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
return 0;
}
-static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static u64 tg_get_cfs_period(struct task_group *tg)
{
- u64 quota, period, burst;
+ u64 cfs_period_us;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- burst = tg->cfs_bandwidth.burst;
- if (cfs_quota_us < 0)
- quota = RUNTIME_INF;
- else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
- quota = (u64)cfs_quota_us * NSEC_PER_USEC;
- else
- return -EINVAL;
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
+ return cfs_period_us;
}
-static long tg_get_cfs_quota(struct task_group *tg)
+static u64 tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
if (tg->cfs_bandwidth.quota == RUNTIME_INF)
- return -1;
+ return RUNTIME_INF;
quota_us = tg->cfs_bandwidth.quota;
do_div(quota_us, NSEC_PER_USEC);
@@ -9546,45 +9666,7 @@ static long tg_get_cfs_quota(struct task_group *tg)
return quota_us;
}
-static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- period = (u64)cfs_period_us * NSEC_PER_USEC;
- quota = tg->cfs_bandwidth.quota;
- burst = tg->cfs_bandwidth.burst;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_period(struct task_group *tg)
-{
- u64 cfs_period_us;
-
- cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
- do_div(cfs_period_us, NSEC_PER_USEC);
-
- return cfs_period_us;
-}
-
-static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us)
-{
- u64 quota, period, burst;
-
- if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
- return -EINVAL;
-
- burst = (u64)cfs_burst_us * NSEC_PER_USEC;
- period = ktime_to_ns(tg->cfs_bandwidth.period);
- quota = tg->cfs_bandwidth.quota;
-
- return tg_set_cfs_bandwidth(tg, period, quota, burst);
-}
-
-static long tg_get_cfs_burst(struct task_group *tg)
+static u64 tg_get_cfs_burst(struct task_group *tg)
{
u64 burst_us;
@@ -9594,42 +9676,6 @@ static long tg_get_cfs_burst(struct task_group *tg)
return burst_us;
}
-static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_quota(css_tg(css));
-}
-
-static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
- struct cftype *cftype, s64 cfs_quota_us)
-{
- return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
-}
-
-static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_period(css_tg(css));
-}
-
-static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_period_us)
-{
- return tg_set_cfs_period(css_tg(css), cfs_period_us);
-}
-
-static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return tg_get_cfs_burst(css_tg(css));
-}
-
-static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cftype, u64 cfs_burst_us)
-{
- return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
-}
-
struct cfs_schedulable_data {
struct task_group *tg;
u64 period, quota;
@@ -9762,6 +9808,126 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
+static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
+/* More than 203 days if BW_SHIFT equals 20. */
+static const u64 max_bw_runtime_us = MAX_BW;
+
+static void tg_bandwidth(struct task_group *tg,
+ u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
+{
+ if (period_us_p)
+ *period_us_p = tg_get_cfs_period(tg);
+ if (quota_us_p)
+ *quota_us_p = tg_get_cfs_quota(tg);
+ if (burst_us_p)
+ *burst_us_p = tg_get_cfs_burst(tg);
+}
+
+static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 period_us;
+
+ tg_bandwidth(css_tg(css), &period_us, NULL, NULL);
+ return period_us;
+}
+
+static int tg_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /* Values should survive translation to nsec */
+ if (period_us > max_usec ||
+ (quota_us != RUNTIME_INF && quota_us > max_usec) ||
+ burst_us > max_usec)
+ return -EINVAL;
+
+ /*
+ * Ensure we have some amount of bandwidth every period. This is to
+ * prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota_us < min_bw_quota_period_us ||
+ period_us < min_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the other side by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period_us > max_bw_quota_period_us)
+ return -EINVAL;
+
+ /*
+ * Bound quota to defend quota against overflow during bandwidth shift.
+ */
+ if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
+ return -EINVAL;
+
+ if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
+ burst_us + quota_us > max_bw_runtime_us))
+ return -EINVAL;
+
+ return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 quota_us;
+
+ tg_bandwidth(css_tg(css), NULL, &quota_us, NULL);
+ return quota_us; /* (s64)RUNTIME_INF becomes -1 */
+}
+
+static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 burst_us;
+
+ tg_bandwidth(css_tg(css), NULL, NULL, &burst_us);
+ return burst_us;
+}
+
+static int cpu_period_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 period_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 quota_us, burst_us;
+
+ tg_bandwidth(tg, NULL, &quota_us, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_quota_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, s64 quota_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, burst_us;
+
+ if (quota_us < 0)
+ quota_us = RUNTIME_INF;
+
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
+static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 burst_us)
+{
+ struct task_group *tg = css_tg(css);
+ u64 period_us, quota_us;
+
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -9807,7 +9973,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
scx_group_set_idle(css_tg(css), idle);
return ret;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -9824,19 +9990,19 @@ static struct cftype cpu_legacy_files[] = {
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
- .name = "cfs_quota_us",
- .read_s64 = cpu_cfs_quota_read_s64,
- .write_s64 = cpu_cfs_quota_write_s64,
+ .name = "cfs_period_us",
+ .read_u64 = cpu_period_read_u64,
+ .write_u64 = cpu_period_write_u64,
},
{
- .name = "cfs_period_us",
- .read_u64 = cpu_cfs_period_read_u64,
- .write_u64 = cpu_cfs_period_write_u64,
+ .name = "cfs_quota_us",
+ .read_s64 = cpu_quota_read_s64,
+ .write_s64 = cpu_quota_write_s64,
},
{
.name = "cfs_burst_us",
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
{
.name = "stat",
@@ -9935,7 +10101,7 @@ static int cpu_extra_stat_show(struct seq_file *sf,
cfs_b->nr_periods, cfs_b->nr_throttled,
throttled_usec, cfs_b->nr_burst, burst_usec);
}
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
return 0;
}
@@ -10033,22 +10199,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
}
/* caller should put the current value in *@periodp before calling */
-static int __maybe_unused cpu_period_quota_parse(char *buf,
- u64 *periodp, u64 *quotap)
+static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
+ u64 *quota_us_p)
{
char tok[21]; /* U64_MAX */
- if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+ if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1)
return -EINVAL;
- *periodp *= NSEC_PER_USEC;
-
- if (sscanf(tok, "%llu", quotap))
- *quotap *= NSEC_PER_USEC;
- else if (!strcmp(tok, "max"))
- *quotap = RUNTIME_INF;
- else
- return -EINVAL;
+ if (sscanf(tok, "%llu", quota_us_p) < 1) {
+ if (!strcmp(tok, "max"))
+ *quota_us_p = RUNTIME_INF;
+ else
+ return -EINVAL;
+ }
return 0;
}
@@ -10057,8 +10221,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
static int cpu_max_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
+ u64 period_us, quota_us;
- cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+ tg_bandwidth(tg, &period_us, &quota_us, NULL);
+ cpu_period_quota_print(sf, period_us, quota_us);
return 0;
}
@@ -10066,17 +10232,16 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct task_group *tg = css_tg(of_css(of));
- u64 period = tg_get_cfs_period(tg);
- u64 burst = tg->cfs_bandwidth.burst;
- u64 quota;
+ u64 period_us, quota_us, burst_us;
int ret;
- ret = cpu_period_quota_parse(buf, &period, &quota);
+ tg_bandwidth(tg, &period_us, NULL, &burst_us);
+ ret = cpu_period_quota_parse(buf, &period_us, &quota_us);
if (!ret)
- ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
+ ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us);
return ret ?: nbytes;
}
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
static struct cftype cpu_files[] = {
#ifdef CONFIG_GROUP_SCHED_WEIGHT
@@ -10109,10 +10274,10 @@ static struct cftype cpu_files[] = {
{
.name = "max.burst",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = cpu_cfs_burst_read_u64,
- .write_u64 = cpu_cfs_burst_write_u64,
+ .read_u64 = cpu_burst_read_u64,
+ .write_u64 = cpu_burst_write_u64,
},
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
.name = "uclamp.min",
@@ -10126,7 +10291,7 @@ static struct cftype cpu_files[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
-#endif
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
{ } /* terminate */
};
@@ -10147,7 +10312,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.threaded = true,
};
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
void dump_cpu_task(int cpu)
{
@@ -10733,7 +10898,7 @@ void sched_mm_cid_fork(struct task_struct *t)
WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
t->mm_cid_active = 1;
}
-#endif
+#endif /* CONFIG_SCHED_MM_CID */
#ifdef CONFIG_SCHED_CLASS_EXT
void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
@@ -10768,4 +10933,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
if (ctx->running)
set_next_task(rq, ctx->p);
}
-#endif /* CONFIG_SCHED_CLASS_EXT */
+#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index c4606ca89210..9ede71ecba7f 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -4,6 +4,8 @@
* A simple wrapper around refcount. An allocated sched_core_cookie's
* address is used to compute the cookie of the task.
*/
+#include "sched.h"
+
struct sched_core_cookie {
refcount_t refcnt;
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 0de9dda09949..23a56ba12d81 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -6,6 +6,8 @@
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
+#include <linux/sched/cputime.h>
+#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 95baa12a1029..cdd740b3f774 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -6,6 +6,7 @@
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
+#include "sched.h"
static inline int parent(int i)
{
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 0adeda93b5fb..11c0f1faa7e1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/types.h>
+#include <linux/spinlock.h>
#define IDX_INVALID -1
@@ -15,7 +17,6 @@ struct cpudl {
struct cpudl_item *elements;
};
-#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
@@ -23,4 +24,3 @@ int cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
-#endif /* CONFIG_SMP */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 5252fb191fae..742fb9e62e1a 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,6 +5,7 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 461242ec958a..0ab5f9d4bc59 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -5,6 +5,8 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
+#include <uapi/linux/sched/types.h>
+#include "sched.h"
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
@@ -380,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
sg_cpu->saved_idle_calls = idle_calls;
return ret;
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* Make sugov_should_update_freq() ignore the rate limit when DL
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 42c40cfdf836..76a9ac5eb794 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -22,6 +22,7 @@
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
+#include "sched.h"
/*
* p->rt_priority p->prio newpri cpupri
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index d6cba0020064..6f562088c056 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/atomic.h>
+#include <linux/cpumask.h>
+#include <linux/sched/rt.h>
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
@@ -17,7 +20,6 @@ struct cpupri {
int *cpu_to_pri;
};
-#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask);
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
@@ -26,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 6dab4854c6c0..7097de2c8cda 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,6 +2,9 @@
/*
* Simple CPU accounting cgroup controller
*/
+#include <linux/sched/cputime.h>
+#include <linux/tsacct_kern.h>
+#include "sched.h"
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>
@@ -88,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime)
return delta;
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static u64 irqtime_tick_accounted(u64 dummy)
{
@@ -241,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta)
task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
-#endif
+#endif /* CONFIG_SCHED_CORE */
/*
* When a guest is interrupted for a longer amount of time, missed clock
@@ -262,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
return steal;
}
-#endif
+#endif /* CONFIG_PARAVIRT */
return 0;
}
@@ -288,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
return t->se.sum_exec_runtime;
}
-#else
+#else /* !CONFIG_64BIT: */
static u64 read_sum_exec_runtime(struct task_struct *t)
{
u64 ns;
@@ -301,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t)
return ns;
}
-#endif
+#endif /* !CONFIG_64BIT */
/*
* Accumulate raw cputime values of dead tasks (sig->[us]time) and live
@@ -411,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks)
{
irqtime_account_process_tick(current, 0, ticks);
}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
int nr_ticks) { }
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* Use precise platform statistics if available:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ad45a8fea245..e2d51f4306b3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,6 +17,10 @@
*/
#include <linux/cpuset.h>
+#include <linux/sched/clock.h>
+#include <uapi/linux/sched/types.h>
+#include "sched.h"
+#include "pelt.h"
/*
* Default limits for DL period; on the top end we guard against small util
@@ -51,7 +55,7 @@ static int __init sched_dl_sysctl_init(void)
return 0;
}
late_initcall(sched_dl_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
static bool dl_server(struct sched_dl_entity *dl_se)
{
@@ -99,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return pi_of(dl_se) != dl_se;
}
-#else
+#else /* !CONFIG_RT_MUTEXES: */
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
return dl_se;
@@ -109,9 +113,8 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return false;
}
-#endif
+#endif /* !CONFIG_RT_MUTEXES */
-#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
@@ -191,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw)
rq->dl.extra_bw += bw;
}
}
-#else
-static inline struct dl_bw *dl_bw_of(int i)
-{
- return &cpu_rq(i)->dl.dl_bw;
-}
-
-static inline int dl_bw_cpus(int i)
-{
- return 1;
-}
-
-static inline unsigned long dl_bw_capacity(int i)
-{
- return SCHED_CAPACITY_SCALE;
-}
-
-bool dl_bw_visited(int cpu, u64 cookie)
-{
- return false;
-}
-
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
- dl->extra_bw += bw;
-}
-#endif
static inline
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
@@ -552,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq)
{
dl_rq->root = RB_ROOT_CACHED;
-#ifdef CONFIG_SMP
/* zero means no -deadline tasks */
dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED;
-#else
- init_dl_bw(&dl_rq->dl_bw);
-#endif
dl_rq->running_bw = 0;
dl_rq->this_bw = 0;
init_dl_rq_bw_ratio(dl_rq);
}
-#ifdef CONFIG_SMP
-
static inline int dl_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->dlo_count);
@@ -753,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
return later_rq;
}
-#else
-
-static inline
-void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline
-void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline
-void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
-{
-}
-
-static inline void deadline_queue_push_tasks(struct rq *rq)
-{
-}
-
-static inline void deadline_queue_pull_task(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
static void
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags);
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -824,6 +761,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ update_rq_clock(rq);
+
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
@@ -1195,7 +1134,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se)
static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
{
-#ifdef CONFIG_SMP
/*
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
@@ -1209,12 +1147,13 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
push_dl_task(rq);
rq_repin_lock(rq, rf);
}
-#endif
}
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
+static bool dl_server_stopped(struct sched_dl_entity *dl_se);
+
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
{
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1234,6 +1173,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
if (!dl_se->server_has_tasks(dl_se)) {
replenish_dl_entity(dl_se);
+ dl_server_stopped(dl_se);
return HRTIMER_NORESTART;
}
@@ -1339,7 +1279,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock;
}
-#ifdef CONFIG_SMP
if (unlikely(!rq->online)) {
/*
* If the runqueue is no longer available, migrate the
@@ -1356,7 +1295,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* there.
*/
}
-#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (dl_task(rq->donor))
@@ -1504,7 +1442,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64
if (dl_entity_is_special(dl_se))
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
+ scaled_delta_exec = delta_exec;
+ if (!dl_server(dl_se))
+ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec);
dl_se->runtime -= scaled_delta_exec;
@@ -1598,7 +1538,7 @@ throttle:
rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
/*
@@ -1611,7 +1551,7 @@ throttle:
*/
void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
{
- s64 delta_exec, scaled_delta_exec;
+ s64 delta_exec;
if (!rq->fair_server.dl_defer)
return;
@@ -1624,9 +1564,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
if (delta_exec < 0)
return;
- scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec);
-
- rq->fair_server.runtime -= scaled_delta_exec;
+ rq->fair_server.runtime -= delta_exec;
if (rq->fair_server.runtime < 0) {
rq->fair_server.dl_defer_running = 0;
@@ -1639,31 +1577,17 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p)
void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec)
{
/* 0 runtime = fair server disabled */
- if (dl_se->dl_runtime)
+ if (dl_se->dl_runtime) {
+ dl_se->dl_server_idle = 0;
update_curr_dl_se(dl_se->rq, dl_se, delta_exec);
+ }
}
void dl_server_start(struct sched_dl_entity *dl_se)
{
struct rq *rq = dl_se->rq;
- /*
- * XXX: the apply do not work fine at the init phase for the
- * fair server because things are not yet set. We need to improve
- * this before getting generic.
- */
- if (!dl_server(dl_se)) {
- u64 runtime = 50 * NSEC_PER_MSEC;
- u64 period = 1000 * NSEC_PER_MSEC;
-
- dl_server_apply_params(dl_se, runtime, period, 1);
-
- dl_se->dl_server = 1;
- dl_se->dl_defer = 1;
- setup_new_dl_entity(dl_se);
- }
-
- if (!dl_se->dl_runtime)
+ if (!dl_server(dl_se) || dl_se->dl_server_active)
return;
dl_se->dl_server_active = 1;
@@ -1674,7 +1598,7 @@ void dl_server_start(struct sched_dl_entity *dl_se)
void dl_server_stop(struct sched_dl_entity *dl_se)
{
- if (!dl_se->dl_runtime)
+ if (!dl_server(dl_se) || !dl_server_active(dl_se))
return;
dequeue_dl_entity(dl_se, DEQUEUE_SLEEP);
@@ -1684,6 +1608,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
dl_se->dl_server_active = 0;
}
+static bool dl_server_stopped(struct sched_dl_entity *dl_se)
+{
+ if (!dl_se->dl_server_active)
+ return false;
+
+ if (dl_se->dl_server_idle) {
+ dl_server_stop(dl_se);
+ return true;
+ }
+
+ dl_se->dl_server_idle = 1;
+ return false;
+}
+
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task)
@@ -1693,6 +1631,32 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_se->server_pick_task = pick_task;
}
+void sched_init_dl_servers(void)
+{
+ int cpu;
+ struct rq *rq;
+ struct sched_dl_entity *dl_se;
+
+ for_each_online_cpu(cpu) {
+ u64 runtime = 50 * NSEC_PER_MSEC;
+ u64 period = 1000 * NSEC_PER_MSEC;
+
+ rq = cpu_rq(cpu);
+
+ guard(rq_lock_irq)(rq);
+
+ dl_se = &rq->fair_server;
+
+ WARN_ON(dl_server(dl_se));
+
+ dl_server_apply_params(dl_se, runtime, period, 1);
+
+ dl_se->dl_server = 1;
+ dl_se->dl_defer = 1;
+ setup_new_dl_entity(dl_se);
+ }
+}
+
void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
{
u64 new_bw = dl_se->dl_bw;
@@ -1844,8 +1808,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
#define __node_2_dle(node) \
rb_entry((node), struct sched_dl_entity, rb_node)
-#ifdef CONFIG_SMP
-
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
{
struct rq *rq = rq_of_dl_rq(dl_rq);
@@ -1881,13 +1843,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
}
}
-#else
-
-static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
-
-#endif /* CONFIG_SMP */
-
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
@@ -2166,6 +2121,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
if (dl_server(&p->dl))
return;
+ if (task_is_blocked(p))
+ return;
+
if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -2214,8 +2172,6 @@ static void yield_task_dl(struct rq *rq)
rq_clock_skip_update(rq);
}
-#ifdef CONFIG_SMP
-
static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
struct rq *rq)
{
@@ -2345,7 +2301,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
return sched_stop_runnable(rq) || sched_dl_runnable(rq);
}
-#endif /* CONFIG_SMP */
/*
* Only called when both the current and waking task are -deadline
@@ -2359,7 +2314,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
return;
}
-#ifdef CONFIG_SMP
/*
* In the unlikely case current and p have the same deadline
* let us try to decide what's the best thing to do...
@@ -2367,7 +2321,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
if ((p->dl.deadline == rq->donor->dl.deadline) &&
!test_tsk_need_resched(rq->curr))
check_preempt_equal_dl(rq, p);
-#endif /* CONFIG_SMP */
}
#ifdef CONFIG_SCHED_HRTICK
@@ -2375,11 +2328,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
hrtick_start(rq, dl_se->runtime);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
@@ -2435,7 +2388,7 @@ again:
if (dl_server(dl_se)) {
p = dl_se->server_pick_task(dl_se);
if (!p) {
- if (dl_server_active(dl_se)) {
+ if (!dl_server_stopped(dl_se)) {
dl_se->dl_yielded = 1;
update_curr_dl_se(rq, dl_se, 0);
}
@@ -2465,6 +2418,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+
+ if (task_is_blocked(p))
+ return;
+
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -2500,8 +2457,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define DL_MAX_TRIES 3
@@ -2976,7 +2931,14 @@ void dl_clear_root_domain(struct root_domain *rd)
int i;
guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
+
+ /*
+ * Reset total_bw to zero and extra_bw to max_bw so that next
+ * loop will add dl-servers contributions back properly,
+ */
rd->dl_bw.total_bw = 0;
+ for_each_cpu(i, rd->span)
+ cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw;
/*
* dl_servers are not tasks. Since dl_add_task_root_domain ignores
@@ -2995,8 +2957,6 @@ void dl_clear_root_domain_cpu(int cpu)
dl_clear_root_domain(cpu_rq(cpu)->rd);
}
-#endif /* CONFIG_SMP */
-
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
/*
@@ -3069,10 +3029,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
}
if (rq->donor != p) {
-#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
deadline_queue_push_tasks(rq);
-#endif
if (dl_task(rq->donor))
wakeup_preempt_dl(rq, p, 0);
else
@@ -3092,7 +3050,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
if (!task_on_rq_queued(p))
return;
-#ifdef CONFIG_SMP
/*
* This might be too much, but unfortunately
* we don't have the old deadline value, and
@@ -3121,13 +3078,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
resched_curr(rq);
}
-#else
- /*
- * We don't know if p has a earlier or later deadline, so let's blindly
- * set a (maybe not needed) rescheduling point.
- */
- resched_curr(rq);
-#endif
}
#ifdef CONFIG_SCHED_CORE
@@ -3149,7 +3099,6 @@ DEFINE_SCHED_CLASS(dl) = {
.put_prev_task = put_prev_task_dl,
.set_next_task = set_next_task_dl,
-#ifdef CONFIG_SMP
.balance = balance_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq = migrate_task_rq_dl,
@@ -3158,7 +3107,6 @@ DEFINE_SCHED_CLASS(dl) = {
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
.find_lock_rq = find_lock_later_rq,
-#endif
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
@@ -3242,6 +3190,9 @@ void sched_dl_do_global(void)
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+ for_each_possible_cpu(cpu)
+ init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
+
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
@@ -3257,7 +3208,6 @@ void sched_dl_do_global(void)
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl);
}
}
@@ -3458,7 +3408,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
return false;
}
-#ifdef CONFIG_SMP
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -3570,7 +3519,6 @@ void dl_bw_free(int cpu, u64 dl_bw)
{
dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
}
-#endif
void print_dl_stats(struct seq_file *m, int cpu)
{
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9d71baf08075..3f06ab84d53f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,6 +6,9 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
+#include <linux/debugfs.h>
+#include <linux/nmi.h>
+#include "sched.h"
/*
* This allows printing both to /sys/kernel/debug/sched/debug and
@@ -90,10 +93,10 @@ static void sched_feat_enable(int i)
{
static_key_enable_cpuslocked(&sched_feat_keys[i]);
}
-#else
+#else /* !CONFIG_JUMP_LABEL: */
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* !CONFIG_JUMP_LABEL */
static int sched_feat_set(char *cmp)
{
@@ -166,8 +169,6 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};
-#ifdef CONFIG_SMP
-
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -214,8 +215,6 @@ static const struct file_operations sched_scaling_fops = {
.release = single_release,
};
-#endif /* SMP */
-
#ifdef CONFIG_PREEMPT_DYNAMIC
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
@@ -283,7 +282,6 @@ static const struct file_operations sched_dynamic_fops = {
__read_mostly bool sched_debug_verbose;
-#ifdef CONFIG_SMP
static struct dentry *sd_dentry;
@@ -311,9 +309,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
return result;
}
-#else
-#define sched_verbose_write debugfs_write_file_bool
-#endif
static const struct file_operations sched_verbose_fops = {
.read = debugfs_read_file_bool,
@@ -512,7 +507,6 @@ static __init int sched_init_debug(void)
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
-#ifdef CONFIG_SMP
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
@@ -520,7 +514,6 @@ static __init int sched_init_debug(void)
sched_domains_mutex_lock();
update_sched_domain_debugfs();
sched_domains_mutex_unlock();
-#endif
#ifdef CONFIG_NUMA_BALANCING
numa = debugfs_create_dir("numa_balancing", debugfs_sched);
@@ -530,7 +523,7 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
@@ -540,8 +533,6 @@ static __init int sched_init_debug(void)
}
late_initcall(sched_init_debug);
-#ifdef CONFIG_SMP
-
static cpumask_var_t sd_sysctl_cpus;
static int sd_flags_show(struct seq_file *m, void *v)
@@ -652,8 +643,6 @@ void dirty_sched_domain_sysctl(int cpu)
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
@@ -690,18 +679,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
}
P(se->load.weight);
-#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
P(se->avg.runnable_avg);
-#endif
#undef PN_SCHEDSTAT
#undef PN
#undef P_SCHEDSTAT
#undef P
}
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
static DEFINE_SPINLOCK(sched_debug_lock);
@@ -854,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
@@ -874,8 +860,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_CFS_BANDWIDTH
SEQ_printf(m, " .%-30s: %d\n", "throttled",
cfs_rq->throttled);
@@ -929,11 +914,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x))
PU(dl_nr_running);
-#ifdef CONFIG_SMP
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
-#else
- dl_bw = &dl_rq->dl_bw;
-#endif
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
@@ -951,9 +932,9 @@ static void print_cpu(struct seq_file *m, int cpu)
SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
cpu, freq / 1000, (freq % 1000));
}
-#else
+#else /* !CONFIG_X86: */
SEQ_printf(m, "cpu#%d\n", cpu);
-#endif
+#endif /* !CONFIG_X86 */
#define P(x) \
do { \
@@ -976,12 +957,10 @@ do { \
#undef P
#undef PN
-#ifdef CONFIG_SMP
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
P64(avg_idle);
P64(max_idle_balance_cost);
#undef P64
-#endif
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
if (schedstat_enabled()) {
@@ -1163,7 +1142,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
}
void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
@@ -1210,10 +1189,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P_SCHEDSTAT(nr_failed_migrations_running);
P_SCHEDSTAT(nr_failed_migrations_hot);
P_SCHEDSTAT(nr_forced_migrations);
-#ifdef CONFIG_NUMA_BALANCING
- P_SCHEDSTAT(numa_task_migrated);
- P_SCHEDSTAT(numa_task_swapped);
-#endif
P_SCHEDSTAT(nr_wakeups);
P_SCHEDSTAT(nr_wakeups_sync);
P_SCHEDSTAT(nr_wakeups_migrate);
@@ -1251,7 +1226,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
__PS("nr_involuntary_switches", p->nivcsw);
P(se.load.weight);
-#ifdef CONFIG_SMP
P(se.avg.load_sum);
P(se.avg.runnable_sum);
P(se.avg.util_sum);
@@ -1260,13 +1234,12 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.avg.util_avg);
P(se.avg.last_update_time);
PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
-#endif
#ifdef CONFIG_UCLAMP_TASK
__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
__PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value);
__PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN));
__PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX));
-#endif
+#endif /* CONFIG_UCLAMP_TASK */
P(policy);
P(prio);
if (task_has_dl_policy(p)) {
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2c41c78be61e..7dd5cbcb7a06 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1272,7 +1272,8 @@ static inline struct rq *scx_locked_rq(void)
#define SCX_CALL_OP(sch, mask, op, rq, args...) \
do { \
- update_locked_rq(rq); \
+ if (rq) \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
(sch)->ops.op(args); \
@@ -1280,14 +1281,16 @@ do { \
} else { \
(sch)->ops.op(args); \
} \
- update_locked_rq(NULL); \
+ if (rq) \
+ update_locked_rq(NULL); \
} while (0)
#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
({ \
__typeof__((sch)->ops.op(args)) __ret; \
\
- update_locked_rq(rq); \
+ if (rq) \
+ update_locked_rq(rq); \
if (mask) { \
scx_kf_allow(mask); \
__ret = (sch)->ops.op(args); \
@@ -1295,7 +1298,8 @@ do { \
} else { \
__ret = (sch)->ops.op(args); \
} \
- update_locked_rq(NULL); \
+ if (rq) \
+ update_locked_rq(NULL); \
__ret; \
})
@@ -4092,6 +4096,11 @@ bool scx_can_stop_tick(struct rq *rq)
DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
static bool scx_cgroup_enabled;
+void scx_tg_init(struct task_group *tg)
+{
+ tg->scx_weight = CGROUP_WEIGHT_DFL;
+}
+
int scx_tg_online(struct task_group *tg)
{
struct scx_sched *sch = scx_root;
@@ -4241,12 +4250,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
percpu_down_read(&scx_cgroup_rwsem);
- if (scx_cgroup_enabled && tg->scx_weight != weight) {
- if (SCX_HAS_OP(sch, cgroup_set_weight))
- SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
- tg_cgrp(tg), weight);
- tg->scx_weight = weight;
- }
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
+ tg->scx_weight != weight)
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
+ tg_cgrp(tg), weight);
+
+ tg->scx_weight = weight;
percpu_up_read(&scx_cgroup_rwsem);
}
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 6e5072f57771..a75835c23f15 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -79,6 +79,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#ifdef CONFIG_CGROUP_SCHED
#ifdef CONFIG_EXT_GROUP_SCHED
+void scx_tg_init(struct task_group *tg);
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
@@ -88,6 +89,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
#else /* CONFIG_EXT_GROUP_SCHED */
+static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 6d29d3cbc670..001fb88a8481 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -903,7 +903,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* selection optimizations and simply check whether the previously
* used CPU is idle and within the allowed cpumask.
*/
- if (p->nr_cpus_allowed == 1) {
+ if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) {
if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
scx_idle_test_and_clear_cpu(prev_cpu))
cpu = prev_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a14da5396fb..b173a059315c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str)
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
-#ifdef CONFIG_SMP
/*
* For asym packing, by default the lower numbered CPU has higher priority.
*/
@@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu)
* (default: ~5%)
*/
#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
-#endif
#ifdef CONFIG_CFS_BANDWIDTH
/*
@@ -162,7 +160,7 @@ static int __init sched_fair_sysctl_init(void)
return 0;
}
late_initcall(sched_fair_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -471,7 +469,7 @@ static int se_is_idle(struct sched_entity *se)
return cfs_rq_is_idle(group_cfs_rq(se));
}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
#define for_each_sched_entity(se) \
for (; se; se = NULL)
@@ -517,7 +515,7 @@ static int se_is_idle(struct sched_entity *se)
return task_has_idle_policy(task_of(se));
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
@@ -884,23 +882,44 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
}
/*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
+ * Set the vruntime up to which an entity can run before looking
+ * for another entity to pick.
+ * In case of run to parity, we use the shortest slice of the enqueued
+ * entities to set the protected period.
+ * When run to parity is disabled, we give a minimum quantum to the running
+ * entity to ensure progress.
*/
-static inline void set_protect_slice(struct sched_entity *se)
+static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- se->vlag = se->deadline;
+ u64 slice = normalized_sysctl_sched_base_slice;
+ u64 vprot = se->deadline;
+
+ if (sched_feat(RUN_TO_PARITY))
+ slice = cfs_rq_min_slice(cfs_rq);
+
+ slice = min(slice, se->slice);
+ if (slice != se->slice)
+ vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
+
+ se->vprot = vprot;
+}
+
+static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ u64 slice = cfs_rq_min_slice(cfs_rq);
+
+ se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se));
}
static inline bool protect_slice(struct sched_entity *se)
{
- return se->vlag == se->deadline;
+ return ((s64)(se->vprot - se->vruntime) > 0);
}
static inline void cancel_protect_slice(struct sched_entity *se)
{
if (protect_slice(se))
- se->vlag = se->deadline + 1;
+ se->vprot = se->vruntime;
}
/*
@@ -922,7 +941,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
*
* Which allows tree pruning through eligibility.
*/
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -939,7 +958,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr))
+ if (curr && protect && protect_slice(curr))
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -983,6 +1002,11 @@ found:
return best;
}
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+ return __pick_eevdf(cfs_rq, true);
+}
+
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -996,7 +1020,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
/**************************************************************
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SMP
int sched_update_scaling(void)
{
unsigned int factor = get_update_sysctl_factor();
@@ -1008,7 +1031,6 @@ int sched_update_scaling(void)
return 0;
}
-#endif
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -1041,7 +1063,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#include "pelt.h"
-#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
@@ -1131,34 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p)
sa->runnable_avg = sa->util_avg;
}
-#else /* !CONFIG_SMP */
-void init_entity_runnable_average(struct sched_entity *se)
-{
-}
-void post_init_entity_util_avg(struct task_struct *p)
-{
-}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq)
-{
-}
-#endif /* CONFIG_SMP */
-
-static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
+static s64 update_se(struct rq *rq, struct sched_entity *se)
{
u64 now = rq_clock_task(rq);
s64 delta_exec;
- delta_exec = now - curr->exec_start;
+ delta_exec = now - se->exec_start;
if (unlikely(delta_exec <= 0))
return delta_exec;
- curr->exec_start = now;
- curr->sum_exec_runtime += delta_exec;
+ se->exec_start = now;
+ if (entity_is_task(se)) {
+ struct task_struct *donor = task_of(se);
+ struct task_struct *running = rq->curr;
+ /*
+ * If se is a task, we account the time against the running
+ * task, as w/ proxy-exec they may not be the same.
+ */
+ running->se.exec_start = now;
+ running->se.sum_exec_runtime += delta_exec;
+
+ trace_sched_stat_runtime(running, delta_exec);
+ account_group_exec_runtime(running, delta_exec);
+
+ /* cgroup time is always accounted against the donor */
+ cgroup_account_cputime(donor, delta_exec);
+ } else {
+ /* If not task, account the time against donor se */
+ se->sum_exec_runtime += delta_exec;
+ }
if (schedstat_enabled()) {
struct sched_statistics *stats;
- stats = __schedstats_from_se(curr);
+ stats = __schedstats_from_se(se);
__schedstat_set(stats->exec_max,
max(delta_exec, stats->exec_max));
}
@@ -1166,58 +1193,12 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
return delta_exec;
}
-static inline void update_curr_task(struct task_struct *p, s64 delta_exec)
-{
- trace_sched_stat_runtime(p, delta_exec);
- account_group_exec_runtime(p, delta_exec);
- cgroup_account_cputime(p, delta_exec);
-}
-
-static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (curr->vlag == curr->deadline)
- return false;
-
- return !entity_eligible(cfs_rq, curr);
-}
-
-static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
- struct sched_entity *pse, struct sched_entity *se)
-{
- if (!sched_feat(PREEMPT_SHORT))
- return false;
-
- if (pse->slice >= se->slice)
- return false;
-
- if (!entity_eligible(cfs_rq, pse))
- return false;
-
- if (entity_before(pse, se))
- return true;
-
- if (!entity_eligible(cfs_rq, se))
- return true;
-
- return false;
-}
-
/*
* Used by other classes to account runtime.
*/
s64 update_curr_common(struct rq *rq)
{
- struct task_struct *donor = rq->donor;
- s64 delta_exec;
-
- delta_exec = update_curr_se(rq, &donor->se);
- if (likely(delta_exec > 0))
- update_curr_task(donor, delta_exec);
-
- return delta_exec;
+ return update_se(rq, &rq->donor->se);
}
/*
@@ -1225,6 +1206,12 @@ s64 update_curr_common(struct rq *rq)
*/
static void update_curr(struct cfs_rq *cfs_rq)
{
+ /*
+ * Note: cfs_rq->curr corresponds to the task picked to
+ * run (ie: rq->donor.se) which due to proxy-exec may
+ * not necessarily be the actual task running
+ * (rq->curr.se). This is easy to confuse!
+ */
struct sched_entity *curr = cfs_rq->curr;
struct rq *rq = rq_of(cfs_rq);
s64 delta_exec;
@@ -1233,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (unlikely(!curr))
return;
- delta_exec = update_curr_se(rq, curr);
+ delta_exec = update_se(rq, curr);
if (unlikely(delta_exec <= 0))
return;
@@ -1242,10 +1229,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
- struct task_struct *p = task_of(curr);
-
- update_curr_task(p, delta_exec);
-
/*
* If the fair_server is active, we need to account for the
* fair_server time whether or not the task is running on
@@ -1265,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
if (cfs_rq->nr_queued == 1)
return;
- if (resched || did_preempt_short(cfs_rq, curr)) {
+ if (resched || !protect_slice(curr)) {
resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
@@ -2114,12 +2097,12 @@ static inline int numa_idle_core(int idle_core, int cpu)
return idle_core;
}
-#else
+#else /* !CONFIG_SCHED_SMT: */
static inline int numa_idle_core(int idle_core, int cpu)
{
return idle_core;
}
-#endif
+#endif /* !CONFIG_SCHED_SMT */
/*
* Gather all necessary information to make NUMA balancing placement
@@ -3673,7 +3656,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu)
p->numa_scan_period = task_scan_start(p);
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
+
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
@@ -3690,20 +3674,18 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
list_add(&se->group_node, &rq->cfs_tasks);
}
-#endif
cfs_rq->nr_queued++;
}
@@ -3711,12 +3693,10 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
-#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
-#endif
cfs_rq->nr_queued--;
}
@@ -3768,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*ptr -= min_t(typeof(*ptr), *ptr, _val); \
} while (0)
-#ifdef CONFIG_SMP
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3785,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
-#else
-static inline void
-enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-static inline void
-dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
-#endif
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
@@ -3822,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_load_set(&se->load, weight);
-#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
-#endif
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
@@ -3863,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
/*
* All this does is approximate the hierarchical proportion which includes that
* global sum we all love to hate.
@@ -3970,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
*/
return clamp_t(long, shares, MIN_SHARES, tg_shares);
}
-#endif /* CONFIG_SMP */
/*
* Recomputes the group entity based on the current state of its group
@@ -3991,20 +3960,16 @@ static void update_cfs_group(struct sched_entity *se)
if (throttled_hierarchy(gcfs_rq))
return;
-#ifndef CONFIG_SMP
- shares = READ_ONCE(gcfs_rq->tg->shares);
-#else
shares = calc_group_shares(gcfs_rq);
-#endif
if (unlikely(se->load.weight != shares))
reweight_entity(cfs_rq_of(se), se, shares);
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_cfs_group(struct sched_entity *se)
{
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
@@ -4029,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
}
-#ifdef CONFIG_SMP
static inline bool load_avg_is_decayed(struct sched_avg *sa)
{
if (sa->load_sum)
@@ -4481,7 +4445,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
return true;
}
-#else /* CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
@@ -4494,7 +4458,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_NO_HZ_COMMON
static inline void migrate_se_pelt_lag(struct sched_entity *se)
@@ -4575,9 +4539,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se)
__update_load_avg_blocked_se(now, se);
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static void migrate_se_pelt_lag(struct sched_entity *se) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
@@ -5144,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
-#else /* CONFIG_SMP */
-
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- return !cfs_rq->nr_queued;
-}
-
-#define UPDATE_TG 0x0
-#define SKIP_AGE_LOAD 0x0
-#define DO_ATTACH 0x0
-#define DO_DETACH 0x0
-
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
-{
- cfs_rq_util_change(cfs_rq, 0);
-}
-
-static inline void remove_entity_load_avg(struct sched_entity *se) {}
-
-static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-
-static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf)
-{
- return 0;
-}
-
-static inline void
-util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
-
-static inline void
-util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
- bool task_sleep) {}
-static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
-
-#endif /* CONFIG_SMP */
-
void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_entity *se = &p->se;
@@ -5253,7 +5175,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
* = (W*V + w_i*(V - vl_i)) / (W + w_i)
* = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
- * = (V*(W + w_i) - w_i*l) / (W + w_i)
+ * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
* = V - w_i*vl_i / (W + w_i)
*
* And the actual lag after adding an entity with vl_i is:
@@ -5550,7 +5472,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- set_protect_slice(se);
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
@@ -5685,7 +5607,7 @@ void cfs_bandwidth_usage_dec(void)
{
static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
-#else /* CONFIG_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL: */
static bool cfs_bandwidth_used(void)
{
return true;
@@ -5693,16 +5615,7 @@ static bool cfs_bandwidth_used(void)
void cfs_bandwidth_usage_inc(void) {}
void cfs_bandwidth_usage_dec(void) {}
-#endif /* CONFIG_JUMP_LABEL */
-
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
- return 100000000ULL;
-}
+#endif /* !CONFIG_JUMP_LABEL */
static inline u64 sched_cfs_bandwidth_slice(void)
{
@@ -5889,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long queued_delta, runnable_delta, idle_delta, dequeue = 1;
- long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5973,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
/* At this point se is NULL and we are at root level*/
sub_nr_running(rq, queued_delta);
-
- /* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
done:
/*
* Note: distribution will already see us throttled via the
@@ -6088,7 +5996,6 @@ unthrottle_throttle:
resched_curr(rq);
}
-#ifdef CONFIG_SMP
static void __cfsb_csd_unthrottle(void *arg)
{
struct cfs_rq *cursor, *tmp;
@@ -6147,12 +6054,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
if (first)
smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
}
-#else
-static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
-{
- unthrottle_cfs_rq(cfs_rq);
-}
-#endif
static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
@@ -6490,8 +6391,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-extern const u64 max_cfs_quota_period;
-
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -6518,7 +6417,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
* to fail.
*/
new = old * 2;
- if (new < max_cfs_quota_period) {
+ if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
cfs_b->period = ns_to_ktime(new);
cfs_b->quota *= 2;
cfs_b->burst *= 2;
@@ -6552,7 +6451,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren
raw_spin_lock_init(&cfs_b->lock);
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
- cfs_b->period = ns_to_ktime(default_cfs_period());
+ cfs_b->period = us_to_ktime(default_bw_period_us());
cfs_b->burst = 0;
cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
@@ -6608,7 +6507,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* guaranteed at this point that no additional cfs_rq of this group can
* join a CSD list.
*/
-#ifdef CONFIG_SMP
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
unsigned long flags;
@@ -6620,7 +6518,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
__cfsb_csd_unthrottle(rq);
local_irq_restore(flags);
}
-#endif
}
/*
@@ -6733,9 +6630,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
if (cfs_task_bw_constrained(p))
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
-#endif
+#endif /* CONFIG_NO_HZ_FULL */
-#else /* CONFIG_CFS_BANDWIDTH */
+#else /* !CONFIG_CFS_BANDWIDTH: */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
@@ -6777,7 +6674,7 @@ bool cfs_task_bw_constrained(struct task_struct *p)
return false;
}
#endif
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* !CONFIG_CFS_BANDWIDTH */
#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
@@ -6822,7 +6719,7 @@ static void hrtick_update(struct rq *rq)
hrtick_start_fair(rq, donor);
}
-#else /* !CONFIG_SCHED_HRTICK */
+#else /* !CONFIG_SCHED_HRTICK: */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
@@ -6831,9 +6728,8 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static inline void hrtick_update(struct rq *rq)
{
}
-#endif
+#endif /* !CONFIG_SCHED_HRTICK */
-#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
unsigned long rq_util_min, rq_util_max;
@@ -6875,9 +6771,6 @@ static inline void check_update_overutilized_status(struct rq *rq)
if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
set_rd_overutilized(rq->rd, 1);
}
-#else
-static inline void check_update_overutilized_status(struct rq *rq) { }
-#endif
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
@@ -6886,12 +6779,10 @@ static int sched_idle_rq(struct rq *rq)
rq->nr_running);
}
-#ifdef CONFIG_SMP
static int sched_idle_cpu(int cpu)
{
return sched_idle_rq(cpu_rq(cpu));
}
-#endif
static void
requeue_delayed_entity(struct sched_entity *se)
@@ -7070,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
@@ -7154,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
sub_nr_running(rq, h_nr_queued);
- if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
- dl_server_stop(&rq->fair_server);
-
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
@@ -7206,8 +7093,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
}
-#ifdef CONFIG_SMP
-
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
@@ -7677,7 +7562,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
return -1;
}
-#else /* CONFIG_SCHED_SMT */
+#else /* !CONFIG_SCHED_SMT: */
static inline void set_idle_cores(int cpu, int val)
{
@@ -7698,7 +7583,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
return -1;
}
-#endif /* CONFIG_SCHED_SMT */
+#endif /* !CONFIG_SCHED_SMT */
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
@@ -8743,9 +8628,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return sched_balance_newidle(rq, rf) != 0;
}
-#else
-static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
-#endif /* CONFIG_SMP */
static void set_next_buddy(struct sched_entity *se)
{
@@ -8767,6 +8649,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
struct sched_entity *se = &donor->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
+ bool do_preempt_short = false;
if (unlikely(se == pse))
return;
@@ -8815,7 +8698,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* When non-idle entity preempt an idle entity,
* don't give idle entity slice protection.
*/
- cancel_protect_slice(se);
+ do_preempt_short = true;
goto preempt;
}
@@ -8833,22 +8716,24 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
/*
* If @p has a shorter slice than current and @p is eligible, override
* current's slice protection in order to allow preemption.
- *
- * Note that even if @p does not turn out to be the most eligible
- * task at this moment, current's slice protection will be lost.
*/
- if (do_preempt_short(cfs_rq, pse, se))
- cancel_protect_slice(se);
+ do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice);
/*
* If @p has become the most eligible task, force preemption.
*/
- if (pick_eevdf(cfs_rq) == pse)
+ if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse)
goto preempt;
+ if (sched_feat(RUN_TO_PARITY) && do_preempt_short)
+ update_protect_slice(cfs_rq, se);
+
return;
preempt:
+ if (do_preempt_short)
+ cancel_protect_slice(se);
+
resched_curr_lazy(rq);
}
@@ -8939,7 +8824,7 @@ again:
return p;
simple:
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
put_prev_set_next_task(rq, prev, p);
return p;
@@ -9055,7 +8940,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
return true;
}
-#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods.
*
@@ -9357,13 +9241,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
return src_weight - dst_weight;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
return 0;
}
-#endif
+#endif /* !CONFIG_NUMA_BALANCING */
/*
* Check whether the task is ineligible on the destination cpu
@@ -9407,7 +9291,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) throttled_lb_pair, or
* 3) cannot be migrated to this CPU due to cpus_ptr, or
* 4) running (obviously), or
- * 5) are cache-hot on their current CPU.
+ * 5) are cache-hot on their current CPU, or
+ * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
*/
if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
return 0;
@@ -9429,6 +9314,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (kthread_is_per_cpu(p))
return 0;
+ if (task_is_blocked(p))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
@@ -9464,7 +9352,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
- if (task_on_cpu(env->src_rq, p)) {
+ if (task_on_cpu(env->src_rq, p) ||
+ task_current_donor(env->src_rq, p)) {
schedstat_inc(p->stats.nr_failed_migrations_running);
return 0;
}
@@ -9508,6 +9397,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
schedstat_inc(p->stats.nr_forced_migrations);
}
+ WARN_ON(task_current(env->src_rq, p));
+ WARN_ON(task_current_donor(env->src_rq, p));
+
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -9772,12 +9664,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
if (!has_blocked)
rq->has_blocked_load = 0;
}
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
static inline bool others_have_blocked(struct rq *rq) { return false; }
static inline void update_blocked_load_tick(struct rq *rq) {}
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
static bool __update_blocked_others(struct rq *rq, bool *done)
{
@@ -9886,7 +9778,7 @@ static unsigned long task_h_load(struct task_struct *p)
return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static bool __update_blocked_fair(struct rq *rq, bool *done)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -9903,7 +9795,7 @@ static unsigned long task_h_load(struct task_struct *p)
{
return p->se.avg.load_avg;
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void sched_balance_update_blocked_averages(int cpu)
{
@@ -10048,9 +9940,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
min_capacity = ULONG_MAX;
max_capacity = 0;
- if (child->flags & SD_OVERLAP) {
+ if (child->flags & SD_NUMA) {
/*
- * SD_OVERLAP domains cannot assume that child groups
+ * SD_NUMA domains cannot assume that child groups
* span the current group.
*/
@@ -10063,7 +9955,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
} else {
/*
- * !SD_OVERLAP domains can assume that child groups
+ * !SD_NUMA domains can assume that child groups
* span the current group.
*/
@@ -10616,7 +10508,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
return remote;
return all;
}
-#else
+#else /* !CONFIG_NUMA_BALANCING: */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
return all;
@@ -10626,7 +10518,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
return regular;
}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif /* !CONFIG_NUMA_BALANCING */
struct sg_lb_stats;
@@ -12174,8 +12066,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
/*
* Track max cost of a domain to make sure to not delay the
* next wakeup on the CPU.
+ *
+ * sched_balance_newidle() bumps the cost whenever newidle
+ * balance fails, and we don't want things to grow out of
+ * control. Use the sysctl_sched_migration_cost as the upper
+ * limit, plus a litle extra to avoid off by ones.
*/
- sd->max_newidle_lb_cost = cost;
+ sd->max_newidle_lb_cost =
+ min(cost, sysctl_sched_migration_cost + 200);
sd->last_decay_max_lb_cost = jiffies;
} else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
/*
@@ -12772,7 +12670,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
}
-#else /* !CONFIG_NO_HZ_COMMON */
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balancer_kick(struct rq *rq) { }
static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
@@ -12781,7 +12679,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
}
static inline void nohz_newidle_balance(struct rq *this_rq) { }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* sched_balance_newidle is called by schedule() if this_cpu is about to become
@@ -12867,10 +12765,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t1 = sched_clock_cpu(this_cpu);
domain_cost = t1 - t0;
- update_newidle_cost(sd, domain_cost);
-
curr_cost += domain_cost;
t0 = t1;
+
+ /*
+ * Failing newidle means it is not effective;
+ * bump the cost so we end up doing less of it.
+ */
+ if (!pulled_task)
+ domain_cost = (3 * sd->max_newidle_lb_cost) / 2;
+
+ update_newidle_cost(sd, domain_cost);
}
/*
@@ -12978,8 +12883,6 @@ static void rq_offline_fair(struct rq *rq)
clear_tg_offline_cfs_rqs(rq);
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_SCHED_CORE
static inline bool
__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
@@ -13076,10 +12979,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
cfs_rqa = sea->cfs_rq;
cfs_rqb = seb->cfs_rq;
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
cfs_rqa = &task_rq(a)->cfs;
cfs_rqb = &task_rq(b)->cfs;
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
@@ -13103,9 +13006,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu)
#endif
return throttled_hierarchy(cfs_rq);
}
-#else
+#else /* !CONFIG_SCHED_CORE: */
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
-#endif
+#endif /* !CONFIG_SCHED_CORE */
/*
* scheduler tick hitting a task of our scheduling class.
@@ -13199,15 +13102,14 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
list_add_leaf_cfs_rq(cfs_rq);
}
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static void detach_entity_cfs_rq(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_SMP
/*
* In case the task sched_avg hasn't been attached:
* - A forked task which hasn't been woken up by wake_up_new_task().
@@ -13216,7 +13118,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
*/
if (!se->avg.last_update_time)
return;
-#endif
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
@@ -13280,7 +13181,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
{
struct sched_entity *se = &p->se;
-#ifdef CONFIG_SMP
if (task_on_rq_queued(p)) {
/*
* Move the next running task to the front of the list, so our
@@ -13288,7 +13188,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
*/
list_move(&se->group_node, &rq->cfs_tasks);
}
-#endif
if (!first)
return;
@@ -13326,9 +13225,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
-#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -13343,10 +13240,8 @@ static void task_change_group_fair(struct task_struct *p)
detach_task_cfs_rq(p);
-#ifdef CONFIG_SMP
/* Tell se's cfs_rq has been changed -- migrated */
p->se.avg.last_update_time = 0;
-#endif
set_task_rq(p, task_cpu(p));
attach_task_cfs_rq(p);
}
@@ -13642,7 +13537,6 @@ DEFINE_SCHED_CLASS(fair) = {
.put_prev_task = put_prev_task_fair,
.set_next_task = set_next_task_fair,
-#ifdef CONFIG_SMP
.balance = balance_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq = migrate_task_rq_fair,
@@ -13652,7 +13546,6 @@ DEFINE_SCHED_CLASS(fair) = {
.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_fair,
-#endif
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -13715,7 +13608,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
__init void init_sched_fair_class(void)
{
-#ifdef CONFIG_SMP
int i;
for_each_possible_cpu(i) {
@@ -13737,6 +13629,4 @@ __init void init_sched_fair_class(void)
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
-#endif /* SMP */
-
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2c85c86b455f..c39b089d4f09 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,6 +6,11 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
+#include <linux/cpuidle.h>
+#include <linux/suspend.h>
+#include <linux/livepatch.h>
+#include "sched.h"
+#include "smp.h"
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -47,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
-#endif
+#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */
static noinline int __cpuidle cpu_idle_poll(void)
{
@@ -95,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void)
if (static_branch_unlikely(&arch_needs_tick_broadcast))
tick_broadcast_exit();
}
-#else
+#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */
static inline void cond_tick_broadcast_enter(void) { }
static inline void cond_tick_broadcast_exit(void) { }
-#endif
+#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */
/**
* default_idle_call - Default CPU idle routine.
@@ -427,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state)
* idle-task scheduling class.
*/
-#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
@@ -439,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return WARN_ON_ONCE(1);
}
-#endif
/*
* Idle tasks are unconditionally rescheduled:
@@ -526,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = {
.put_prev_task = put_prev_task_idle,
.set_next_task = set_next_task_idle,
-#ifdef CONFIG_SMP
.balance = balance_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
-#endif
.task_tick = task_tick_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 93b038d48900..a4cf17b1fab0 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,6 +7,8 @@
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
+#include <linux/sched/isolation.h>
+#include "sched.h"
enum hk_flags {
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index c48900b856a2..b601e0243d0e 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,6 +6,8 @@
* figure. Its a silly number but people think its important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
+#include <linux/sched/nohz.h>
+#include "sched.h"
/*
* Global load-average calculations
@@ -80,7 +82,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust)
long nr_active, delta = 0;
nr_active = this_rq->nr_running - adjust;
- nr_active += (int)this_rq->nr_uninterruptible;
+ nr_active += (long)this_rq->nr_uninterruptible;
if (nr_active != this_rq->calc_load_active) {
delta = nr_active - this_rq->calc_load_active;
@@ -333,12 +335,12 @@ static void calc_global_nohz(void)
smp_wmb();
calc_load_idx++;
}
-#else /* !CONFIG_NO_HZ_COMMON */
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
-#endif /* CONFIG_NO_HZ_COMMON */
+#endif /* !CONFIG_NO_HZ_COMMON */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 809194cd779f..62fba83b7bb1 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,6 +4,8 @@
*
* membarrier system call
*/
+#include <uapi/linux/membarrier.h>
+#include "sched.h"
/*
* For documentation purposes, here are some membarrier ordering
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 7a8534a2deff..fa83bbaf4f3e 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -23,6 +23,7 @@
* Move PELT related code from fair.c into this pelt.c file
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
+#include "pelt.h"
/*
* Approximate:
@@ -413,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
return 0;
}
-#endif
+#endif /* CONFIG_SCHED_HW_PRESSURE */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
@@ -466,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
return ret;
}
-#endif
+#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */
/*
* Load avg and utiliztion metrics need to be updated periodically and before
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index f4f6a0875c66..62c3fa543c0f 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,4 +1,8 @@
-#ifdef CONFIG_SMP
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _KERNEL_SCHED_PELT_H
+#define _KERNEL_SCHED_PELT_H
+#include "sched.h"
+
#include "sched-pelt.h"
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
@@ -15,7 +19,7 @@ static inline u64 hw_load_avg(struct rq *rq)
{
return READ_ONCE(rq->avg_hw.load_avg);
}
-#else
+#else /* !CONFIG_SCHED_HW_PRESSURE: */
static inline int
update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
@@ -26,7 +30,7 @@ static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
-#endif
+#endif /* !CONFIG_SCHED_HW_PRESSURE */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
@@ -174,63 +178,12 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
-#else
+#else /* !CONFIG_CFS_BANDWIDTH: */
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
}
-#endif
-
-#else
-
-static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
-{
- return 0;
-}
-
-static inline int
-update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
-{
- return 0;
-}
-
-static inline int
-update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
-{
- return 0;
-}
-
-static inline int
-update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
-{
- return 0;
-}
-
-static inline u64 hw_load_avg(struct rq *rq)
-{
- return 0;
-}
-
-static inline int
-update_irq_load_avg(struct rq *rq, u64 running)
-{
- return 0;
-}
-
-static inline u64 rq_clock_pelt(struct rq *rq)
-{
- return rq_clock_task(rq);
-}
-
-static inline void
-update_rq_clock_pelt(struct rq *rq, s64 delta) { }
-
-static inline void
-update_idle_rq_clock_pelt(struct rq *rq) { }
-
-static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
-#endif
-
+#endif /* !CONFIG_CFS_BANDWIDTH */
+#endif /* _KERNEL_SCHED_PELT_H */
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ad04a5c3162a..2024c1d36402 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -136,6 +136,10 @@
* cost-wise, yet way more sensitive and accurate than periodic
* sampling of the aggregate task states would be.
*/
+#include <linux/sched/clock.h>
+#include <linux/workqueue.h>
+#include <linux/psi.h>
+#include "sched.h"
static int psi_bug __read_mostly;
@@ -172,6 +176,28 @@ struct psi_group psi_system = {
.pcpu = &system_group_pcpu,
};
+static DEFINE_PER_CPU(seqcount_t, psi_seq);
+
+static inline void psi_write_begin(int cpu)
+{
+ write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline void psi_write_end(int cpu)
+{
+ write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline u32 psi_read_begin(int cpu)
+{
+ return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
+}
+
+static inline bool psi_read_retry(int cpu, u32 seq)
+{
+ return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq);
+}
+
static void psi_avgs_work(struct work_struct *work);
static void poll_timer_fn(struct timer_list *t);
@@ -182,7 +208,7 @@ static void group_init(struct psi_group *group)
group->enabled = true;
for_each_possible_cpu(cpu)
- seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
+ seqcount_init(per_cpu_ptr(&psi_seq, cpu));
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
mutex_init(&group->avgs_lock);
@@ -262,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu,
/* Snapshot a coherent view of the CPU state */
do {
- seq = read_seqcount_begin(&groupc->seq);
+ seq = psi_read_begin(cpu);
now = cpu_clock(cpu);
memcpy(times, groupc->times, sizeof(groupc->times));
state_mask = groupc->state_mask;
state_start = groupc->state_start;
if (cpu == current_cpu)
memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
- } while (read_seqcount_retry(&groupc->seq, seq));
+ } while (psi_read_retry(cpu, seq));
/* Calculate state time deltas against the previous snapshot */
for (s = 0; s < NR_PSI_STATES; s++) {
@@ -768,31 +794,21 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
groupc->times[PSI_NONIDLE] += delta;
}
+#define for_each_group(iter, group) \
+ for (typeof(group) iter = group; iter; iter = iter->parent)
+
static void psi_group_change(struct psi_group *group, int cpu,
unsigned int clear, unsigned int set,
- bool wake_clock)
+ u64 now, bool wake_clock)
{
struct psi_group_cpu *groupc;
unsigned int t, m;
u32 state_mask;
- u64 now;
lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
- * First we update the task counts according to the state
- * change requested through the @clear and @set bits.
- *
- * Then if the cgroup PSI stats accounting enabled, we
- * assess the aggregate resource states this CPU's tasks
- * have been in since the last change, and account any
- * SOME and FULL time these may have resulted in.
- */
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
- /*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
@@ -843,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
return;
}
@@ -864,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
groupc->state_mask = state_mask;
- write_seqcount_end(&groupc->seq);
-
if (state_mask & group->rtpoll_states)
psi_schedule_rtpoll_work(group, 1, false);
@@ -900,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set)
void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
- struct psi_group *group;
+ u64 now;
if (!task->pid)
return;
psi_flags_change(task, clear, set);
- group = task_psi_group(task);
- do {
- psi_group_change(group, cpu, clear, set, true);
- } while ((group = group->parent));
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(task))
+ psi_group_change(group, cpu, clear, set, now, true);
+ psi_write_end(cpu);
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep)
{
- struct psi_group *group, *common = NULL;
+ struct psi_group *common = NULL;
int cpu = task_cpu(prev);
+ u64 now;
+
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
if (next->pid) {
psi_flags_change(next, 0, TSK_ONCPU);
@@ -926,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
* ancestors with @prev, those will already have @prev's
* TSK_ONCPU bit set, and we can stop the iteration there.
*/
- group = task_psi_group(next);
- do {
- if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
- PSI_ONCPU) {
+ for_each_group(group, task_psi_group(next)) {
+ struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+
+ if (groupc->state_mask & PSI_ONCPU) {
common = group;
break;
}
-
- psi_group_change(group, cpu, 0, TSK_ONCPU, true);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+ }
}
if (prev->pid) {
@@ -968,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
psi_flags_change(prev, clear, set);
- group = task_psi_group(prev);
- do {
+ for_each_group(group, task_psi_group(prev)) {
if (group == common)
break;
- psi_group_change(group, cpu, clear, set, wake_clock);
- } while ((group = group->parent));
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
+ }
/*
* TSK_ONCPU is handled up to the common ancestor. If there are
@@ -983,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
*/
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
- for (; group; group = group->parent)
- psi_group_change(group, cpu, clear, set, wake_clock);
+ for_each_group(group, common)
+ psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
+ psi_write_end(cpu);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
int cpu = task_cpu(curr);
- struct psi_group *group;
struct psi_group_cpu *groupc;
s64 delta;
u64 irq;
+ u64 now;
if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
@@ -1005,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
lockdep_assert_rq_held(rq);
- group = task_psi_group(curr);
- if (prev && task_psi_group(prev) == group)
+ if (prev && task_psi_group(prev) == task_psi_group(curr))
return;
irq = irq_time_read(cpu);
@@ -1015,27 +1031,24 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
return;
rq->psi_irq_time = irq;
- do {
- u64 now;
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ for_each_group(group, task_psi_group(curr)) {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
- write_seqcount_begin(&groupc->seq);
- now = cpu_clock(cpu);
-
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
- write_seqcount_end(&groupc->seq);
-
if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
psi_schedule_rtpoll_work(group, 1, false);
- } while ((group = group->parent));
+ }
+ psi_write_end(cpu);
}
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/**
* psi_memstall_enter - mark the beginning of a memory stall section
@@ -1221,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group)
return;
for_each_possible_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
+ u64 now;
+
+ guard(rq_lock_irq)(cpu_rq(cpu));
- rq_lock_irq(rq, &rf);
- psi_group_change(group, cpu, 0, 0, true);
- rq_unlock_irq(rq, &rf);
+ psi_write_begin(cpu);
+ now = cpu_clock(cpu);
+ psi_group_change(group, cpu, 0, 0, now, true);
+ psi_write_end(cpu);
}
}
#endif /* CONFIG_CGROUPS */
@@ -1651,7 +1666,7 @@ static const struct proc_ops psi_irq_proc_ops = {
.proc_poll = psi_fop_poll,
.proc_release = psi_fop_release,
};
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
static int __init psi_proc_init(void)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e40422c37033..7936d4333731 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -4,6 +4,9 @@
* policies)
*/
+#include "sched.h"
+#include "pelt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
@@ -60,7 +63,7 @@ static int __init sched_rt_sysctl_init(void)
return 0;
}
late_initcall(sched_rt_sysctl_init);
-#endif
+#endif /* CONFIG_SYSCTL */
void init_rt_rq(struct rt_rq *rt_rq)
{
@@ -75,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -291,7 +292,7 @@ err:
return 0;
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
#define rt_entity_is_task(rt_se) (1)
@@ -327,9 +328,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
return 1;
}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-#ifdef CONFIG_SMP
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
@@ -430,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
}
}
-#else
-
-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void rt_queue_push_tasks(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
@@ -485,12 +469,12 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
return cpu_cap >= min(min_cap, max_cap);
}
-#else
+#else /* !CONFIG_UCLAMP_TASK: */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
return true;
}
-#endif
+#endif /* !CONFIG_UCLAMP_TASK */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -594,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
return p->prio != p->normal_prio;
}
-#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
return this_rq()->rd->span;
}
-#else
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-#endif
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
@@ -625,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
rt_rq->rt_time < rt_b->rt_runtime);
}
-#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
*/
@@ -798,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq)
raw_spin_lock(&rt_rq->rt_runtime_lock);
}
}
-#else /* !CONFIG_SMP */
-static inline void balance_runtime(struct rt_rq *rt_rq) {}
-#endif /* CONFIG_SMP */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
@@ -930,7 +903,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
typedef struct rt_rq *rt_rq_iter_t;
@@ -977,12 +950,10 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
return &cpu_rq(cpu)->rt;
}
-#ifdef CONFIG_SMP
static void __enable_runtime(struct rq *rq) { }
static void __disable_runtime(struct rq *rq) { }
-#endif
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
@@ -1033,7 +1004,7 @@ static void update_curr_rt(struct rq *rq)
do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
static void
@@ -1075,8 +1046,6 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
cpufreq_update_util(rq, 0);
}
-#if defined CONFIG_SMP
-
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
@@ -1107,16 +1076,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
-#else /* CONFIG_SMP */
-
-static inline
-void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-static inline
-void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
-
-#endif /* CONFIG_SMP */
-
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
@@ -1155,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
-#else
-
-static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
-static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
-
-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_RT_GROUP_SCHED
static void
@@ -1182,7 +1134,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}
-#else /* CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
@@ -1192,7 +1144,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
@@ -1488,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
+ if (task_is_blocked(p))
+ return;
+
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1538,7 +1493,6 @@ static void yield_task_rt(struct rq *rq)
requeue_task_rt(rq, rq->curr, 0);
}
-#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
static int
@@ -1653,7 +1607,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
-#endif /* CONFIG_SMP */
/*
* Preempt the current task with a newly woken task if needed:
@@ -1667,7 +1620,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
return;
}
-#ifdef CONFIG_SMP
/*
* If:
*
@@ -1682,7 +1634,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
*/
if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
-#endif
}
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
@@ -1768,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
+ if (task_is_blocked(p))
+ return;
/*
* The previous task needs to be made eligible for pushing
* if it is still active
@@ -1776,8 +1729,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
enqueue_pushable_task(rq, p);
}
-#ifdef CONFIG_SMP
-
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
@@ -2451,7 +2402,6 @@ void __init init_sched_rt_class(void)
GFP_KERNEL, cpu_to_node(i));
}
}
-#endif /* CONFIG_SMP */
/*
* When switching a task to RT, we may overload the runqueue
@@ -2475,10 +2425,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
* then see if we can move to another run queue.
*/
if (task_on_rq_queued(p)) {
-#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
-#endif /* CONFIG_SMP */
if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
@@ -2495,7 +2443,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
return;
if (task_current_donor(rq, p)) {
-#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
* may need to pull tasks to this runqueue.
@@ -2509,11 +2456,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (p->prio > rq->rt.highest_prio.curr)
resched_curr(rq);
-#else
- /* For UP simply resched on drop of prio */
- if (oldprio < p->prio)
- resched_curr(rq);
-#endif /* CONFIG_SMP */
} else {
/*
* This task is not running, but if it is
@@ -2549,9 +2491,9 @@ static void watchdog(struct rq *rq, struct task_struct *p)
}
}
}
-#else
+#else /* !CONFIG_POSIX_TIMERS: */
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
-#endif
+#endif /* !CONFIG_POSIX_TIMERS */
/*
* scheduler tick hitting a task of our scheduling class.
@@ -2620,7 +2562,7 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu)
return rt_rq_throttled(rt_rq);
}
-#endif
+#endif /* CONFIG_SCHED_CORE */
DEFINE_SCHED_CLASS(rt) = {
@@ -2634,7 +2576,6 @@ DEFINE_SCHED_CLASS(rt) = {
.put_prev_task = put_prev_task_rt,
.set_next_task = set_next_task_rt,
-#ifdef CONFIG_SMP
.balance = balance_rt,
.select_task_rq = select_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
@@ -2643,7 +2584,6 @@ DEFINE_SCHED_CLASS(rt) = {
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
.find_lock_rq = find_lock_lowest_rq,
-#endif
.task_tick = task_tick_rt,
@@ -2887,7 +2827,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
return 1;
}
-#else /* !CONFIG_RT_GROUP_SCHED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
@@ -2895,7 +2835,7 @@ static int sched_rt_global_constraints(void)
return 0;
}
#endif /* CONFIG_SYSCTL */
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
@@ -2951,6 +2891,12 @@ undo:
sched_domains_mutex_unlock();
mutex_unlock(&mutex);
+ /*
+ * After changing maximum available bandwidth for DEADLINE, we need to
+ * recompute per root domain and per cpus variables accordingly.
+ */
+ rebuild_sched_domains();
+
return ret;
}
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index c529706bed11..6803cfec7a1e 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+#include <linux/types.h>
static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 475bb5998295..d3f33d10c58c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -69,6 +69,7 @@
#include <linux/wait_bit.h>
#include <linux/workqueue_api.h>
#include <linux/delayacct.h>
+#include <linux/mmu_context.h>
#include <trace/events/power.h>
#include <trace/events/sched.h>
@@ -384,6 +385,7 @@ extern void dl_server_stop(struct sched_dl_entity *dl_se);
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_server_has_tasks_f has_tasks,
dl_server_pick_f pick_task);
+extern void sched_init_dl_servers(void);
extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
@@ -401,6 +403,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
extern struct list_head task_groups;
+#ifdef CONFIG_CFS_BANDWIDTH
+extern const u64 max_bw_quota_period_us;
+
+/*
+ * default period for group bandwidth.
+ * default: 0.1s, units: microseconds
+ */
+static inline u64 default_bw_period_us(void)
+{
+ return 100000ULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
raw_spinlock_t lock;
@@ -424,7 +439,7 @@ struct cfs_bandwidth {
int nr_burst;
u64 throttled_time;
u64 burst_time;
-#endif
+#endif /* CONFIG_CFS_BANDWIDTH */
};
/* Task group related information */
@@ -442,15 +457,13 @@ struct task_group {
/* runqueue "owned" by this group on each CPU */
struct cfs_rq **cfs_rq;
unsigned long shares;
-#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
* it in its own cache-line separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
-#endif
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
@@ -531,7 +544,7 @@ extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline void free_fair_sched_group(struct task_group *tg) { }
static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
@@ -539,7 +552,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou
}
static inline void online_fair_sched_group(struct task_group *tg) { }
static inline void unregister_fair_sched_group(struct task_group *tg) { }
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
@@ -573,25 +586,20 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern int sched_group_set_idle(struct task_group *tg, long idle);
-#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
-#else /* !CONFIG_SMP */
-static inline void set_task_rq_fair(struct sched_entity *se,
- struct cfs_rq *prev, struct cfs_rq *next) { }
-#endif /* CONFIG_SMP */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
-#else /* CONFIG_CGROUP_SCHED */
+#else /* !CONFIG_CGROUP_SCHED: */
struct cfs_bandwidth { };
static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* !CONFIG_CGROUP_SCHED */
extern void unregister_rt_sched_group(struct task_group *tg);
extern void free_rt_sched_group(struct task_group *tg);
@@ -667,7 +675,6 @@ struct cfs_rq {
struct sched_entity *curr;
struct sched_entity *next;
-#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
@@ -699,7 +706,6 @@ struct cfs_rq {
u64 last_h_load_update;
struct sched_entity *h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
-#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
@@ -796,19 +802,13 @@ struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
unsigned int rr_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
int next; /* next highest */
-#endif
} highest_prio;
-#endif
-#ifdef CONFIG_SMP
bool overloaded;
struct plist_head pushable_tasks;
-#endif /* CONFIG_SMP */
int rt_queued;
#ifdef CONFIG_RT_GROUP_SCHED
@@ -839,7 +839,6 @@ struct dl_rq {
unsigned int dl_nr_running;
-#ifdef CONFIG_SMP
/*
* Deadline values of the currently executing and the
* earliest ready task on this rq. Caching these facilitates
@@ -859,9 +858,7 @@ struct dl_rq {
* of the leftmost (earliest deadline) element.
*/
struct rb_root_cached pushable_dl_tasks_root;
-#else
- struct dl_bw dl_bw;
-#endif
+
/*
* "Active utilization" for this runqueue: increased when a
* task wakes up (becomes TASK_RUNNING) and decreased when a
@@ -932,7 +929,6 @@ static inline long se_runnable(struct sched_entity *se)
#endif /* !CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_SMP
/*
* XXX we want to get rid of these helpers and use the full load resolution.
*/
@@ -1008,7 +1004,7 @@ struct root_domain {
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
-#endif
+#endif /* HAVE_RT_PUSH_IPI */
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -1043,7 +1039,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status)
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
-#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -1107,18 +1102,14 @@ struct rq {
unsigned int numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
-#ifdef CONFIG_SMP
unsigned long last_blocked_load_update_tick;
unsigned int has_blocked_load;
call_single_data_t nohz_csd;
-#endif /* CONFIG_SMP */
unsigned int nohz_tick_stopped;
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
-#ifdef CONFIG_SMP
unsigned int ttwu_pending;
-#endif
u64 nr_switches;
#ifdef CONFIG_UCLAMP_TASK
@@ -1149,12 +1140,17 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
- unsigned int nr_uninterruptible;
+ unsigned long nr_uninterruptible;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+ struct task_struct __rcu *donor; /* Scheduling context */
+ struct task_struct __rcu *curr; /* Execution context */
+#else
union {
struct task_struct __rcu *donor; /* Scheduler context */
struct task_struct __rcu *curr; /* Execution context */
};
+#endif
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
@@ -1183,7 +1179,6 @@ struct rq {
int membarrier_state;
#endif
-#ifdef CONFIG_SMP
struct root_domain *rd;
struct sched_domain __rcu *sd;
@@ -1224,7 +1219,6 @@ struct rq {
#ifdef CONFIG_HOTPLUG_CPU
struct rcuwait hotplug_wait;
#endif
-#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
@@ -1242,9 +1236,7 @@ struct rq {
long calc_load_active;
#ifdef CONFIG_SCHED_HRTICK
-#ifdef CONFIG_SMP
call_single_data_t hrtick_csd;
-#endif
struct hrtimer hrtick_timer;
ktime_t hrtick_time;
#endif
@@ -1271,9 +1263,7 @@ struct rq {
struct cpuidle_state *idle_state;
#endif
-#ifdef CONFIG_SMP
unsigned int nr_pinned;
-#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
@@ -1294,12 +1284,12 @@ struct rq {
unsigned int core_forceidle_seq;
unsigned int core_forceidle_occupation;
u64 core_forceidle_start;
-#endif
+#endif /* CONFIG_SCHED_CORE */
/* Scratch cpumask to be temporarily used under rq_lock */
cpumask_var_t scratch_mask;
-#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP)
+#ifdef CONFIG_CFS_BANDWIDTH
call_single_data_t cfsb_csd;
struct list_head cfsb_csd_list;
#endif
@@ -1313,32 +1303,24 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
return cfs_rq->rq;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
return container_of(cfs_rq, struct rq, cfs);
}
-#endif
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
static inline int cpu_of(struct rq *rq)
{
-#ifdef CONFIG_SMP
return rq->cpu;
-#else
- return 0;
-#endif
}
#define MDF_PUSH 0x01
static inline bool is_migration_disabled(struct task_struct *p)
{
-#ifdef CONFIG_SMP
return p->migration_disabled;
-#else
- return false;
-#endif
}
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1349,10 +1331,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+#ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ rcu_assign_pointer(rq->donor, t);
+}
+#else
static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
{
/* Do nothing */
}
+#endif
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -1500,6 +1489,7 @@ static inline bool sched_group_cookie_match(struct rq *rq,
}
#endif /* !CONFIG_SCHED_CORE */
+
#ifdef CONFIG_RT_GROUP_SCHED
# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
DECLARE_STATIC_KEY_FALSE(rt_group_sched);
@@ -1507,16 +1497,16 @@ static inline bool rt_group_sched_enabled(void)
{
return static_branch_unlikely(&rt_group_sched);
}
-# else
+# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */
DECLARE_STATIC_KEY_TRUE(rt_group_sched);
static inline bool rt_group_sched_enabled(void)
{
return static_branch_likely(&rt_group_sched);
}
-# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
-#else
+# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
+#else /* !CONFIG_RT_GROUP_SCHED: */
# define rt_group_sched_enabled() false
-#endif /* CONFIG_RT_GROUP_SCHED */
+#endif /* !CONFIG_RT_GROUP_SCHED */
static inline void lockdep_assert_rq_held(struct rq *rq)
{
@@ -1574,9 +1564,9 @@ static inline void update_idle_core(struct rq *rq)
__update_idle_core(rq);
}
-#else
+#else /* !CONFIG_SCHED_SMT: */
static inline void update_idle_core(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_SCHED_SMT */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1757,7 +1747,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq)
WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
}
-#else /* !CONFIG_SCHED_CLASS_EXT */
+#else /* !CONFIG_SCHED_CLASS_EXT: */
#define scx_enabled() false
#define scx_switched_all() false
@@ -1781,9 +1771,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-#ifdef CONFIG_SMP
WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
@@ -1961,8 +1949,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
#endif /* !CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_SMP
-
static inline void
queue_balance_callback(struct rq *rq,
struct balance_callback *head,
@@ -2128,8 +2114,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
return p->user_cpus_ptr;
}
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -2174,7 +2158,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
tg = &root_task_group;
p->rt.rt_rq = tg->rt_rq[cpu];
p->rt.parent = tg->rt_se[cpu];
-#endif
+#endif /* CONFIG_RT_GROUP_SCHED */
}
#else /* !CONFIG_CGROUP_SCHED: */
@@ -2200,7 +2184,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
-#endif
+#endif /* CONFIG_SMP */
}
/*
@@ -2278,13 +2262,17 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p)
return rq->donor == p;
}
+static inline bool task_is_blocked(struct task_struct *p)
+{
+ if (!sched_proxy_exec())
+ return false;
+
+ return !!p->blocked_on;
+}
+
static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
-#ifdef CONFIG_SMP
return p->on_cpu;
-#else
- return task_current(rq, p);
-#endif
}
static inline int task_on_rq_queued(struct task_struct *p)
@@ -2307,11 +2295,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
#define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */
-#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
static_assert(WF_FORK == SD_BALANCE_FORK);
static_assert(WF_TTWU == SD_BALANCE_WAKE);
-#endif
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -2367,11 +2353,7 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_HEAD 0x10
#define ENQUEUE_REPLENISH 0x20
-#ifdef CONFIG_SMP
#define ENQUEUE_MIGRATED 0x40
-#else
-#define ENQUEUE_MIGRATED 0x00
-#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
#define ENQUEUE_DELAYED 0x200
@@ -2416,7 +2398,6 @@ struct sched_class {
void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
-#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
@@ -2429,7 +2410,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
-#endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
@@ -2487,7 +2467,7 @@ static inline void put_prev_set_next_task(struct rq *rq,
struct task_struct *prev,
struct task_struct *next)
{
- WARN_ON_ONCE(rq->curr != prev);
+ WARN_ON_ONCE(rq->donor != prev);
__put_prev_set_next_dl_server(rq, prev, next);
@@ -2581,8 +2561,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq);
#define SCA_MIGRATE_ENABLE 0x04
#define SCA_USER 0x08
-#ifdef CONFIG_SMP
-
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void sched_balance_trigger(struct rq *rq);
@@ -2634,26 +2612,6 @@ static inline struct task_struct *get_push_task(struct rq *rq)
extern int push_cpu_stop(void *arg);
-#else /* !CONFIG_SMP: */
-
-static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu)
-{
- return true;
-}
-
-static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx)
-{
- return set_cpus_allowed_ptr(p, ctx->new_mask);
-}
-
-static inline cpumask_t *alloc_user_cpus_ptr(int node)
-{
- return NULL;
-}
-
-#endif /* !CONFIG_SMP */
-
#ifdef CONFIG_CPU_IDLE
static inline void idle_set_state(struct rq *rq,
@@ -2749,10 +2707,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
call_trace_sched_update_nr_running(rq, count);
}
-#ifdef CONFIG_SMP
if (prev_nr < 2 && rq->nr_running >= 2)
set_rd_overloaded(rq->rd, 1);
-#endif
sched_update_tick_dependency(rq);
}
@@ -2918,10 +2874,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
{
rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
- /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
-#ifdef CONFIG_SMP
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
-#endif
}
#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
@@ -2930,8 +2883,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock
{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
_lock; return _t; }
-#ifdef CONFIG_SMP
-
static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
{
#ifdef CONFIG_SCHED_CORE
@@ -2954,7 +2905,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2)
/*
* __sched_core_flip() relies on SMT having cpu-id lock order.
*/
-#endif
+#endif /* CONFIG_SCHED_CORE */
return rq1->cpu < rq2->cpu;
}
@@ -3091,42 +3042,6 @@ extern void set_rq_offline(struct rq *rq);
extern bool sched_smp_initialized;
-#else /* !CONFIG_SMP: */
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- WARN_ON_ONCE(!irqs_disabled());
- WARN_ON_ONCE(rq1 != rq2);
- raw_spin_rq_lock(rq1);
- __acquire(rq2->lock); /* Fake it out ;) */
- double_rq_clock_clear_update(rq1, rq2);
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
-{
- WARN_ON_ONCE(rq1 != rq2);
- raw_spin_rq_unlock(rq1);
- __release(rq2->lock);
-}
-
-#endif /* !CONFIG_SMP */
-
DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
double_rq_lock(_T->lock, _T->lock2),
double_rq_unlock(_T->lock, _T->lock2))
@@ -3145,6 +3060,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
+
#ifdef CONFIG_NUMA_BALANCING
extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
@@ -3184,7 +3100,7 @@ extern void nohz_balance_exit_idle(struct rq *rq);
static inline void nohz_balance_exit_idle(struct rq *rq) { }
#endif /* !CONFIG_NO_HZ_COMMON */
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
extern void nohz_run_idle_balance(int cpu);
#else
static inline void nohz_run_idle_balance(int cpu) { }
@@ -3254,14 +3170,14 @@ static inline u64 irq_time_read(int cpu)
return total;
}
-#else
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline int irqtime_enabled(void)
{
return 0;
}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
@@ -3310,8 +3226,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
# define arch_scale_freq_invariant() false
#endif
-#ifdef CONFIG_SMP
-
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max);
@@ -3355,10 +3269,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
-#else /* !CONFIG_SMP */
-static inline bool update_other_load_avgs(struct rq *rq) { return false; }
-#endif /* CONFIG_SMP */
-
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
@@ -3535,13 +3445,13 @@ static inline bool sched_energy_enabled(void)
return static_branch_unlikely(&sched_energy_present);
}
-#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
#define perf_domain_span(pd) NULL
static inline bool sched_energy_enabled(void) { return false; }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#ifdef CONFIG_MEMBARRIER
@@ -3567,7 +3477,7 @@ static inline void membarrier_switch_mm(struct rq *rq,
WRITE_ONCE(rq->membarrier_state, membarrier_state);
}
-#else /* !CONFIG_MEMBARRIER :*/
+#else /* !CONFIG_MEMBARRIER: */
static inline void membarrier_switch_mm(struct rq *rq,
struct mm_struct *prev_mm,
@@ -3577,7 +3487,6 @@ static inline void membarrier_switch_mm(struct rq *rq,
#endif /* !CONFIG_MEMBARRIER */
-#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
{
if (!(p->flags & PF_KTHREAD))
@@ -3588,7 +3497,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
return true;
}
-#endif
extern void swake_up_all_locked(struct swait_queue_head *q);
extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
@@ -3887,7 +3795,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
-#ifdef CONFIG_SMP
static inline
void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
{
@@ -3908,7 +3815,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
return false;
}
-#endif
#ifdef CONFIG_RT_MUTEXES
@@ -3949,21 +3855,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio);
-#ifdef CONFIG_SMP
extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
-#else
-
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
-{
- return NULL;
-}
-
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
-{
-}
-
-#endif
#ifdef CONFIG_SCHED_CLASS_EXT
/*
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 21ac44428bb0..7f151d96dba9 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -1,8 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_SCHED_SMP_H
+#define _KERNEL_SCHED_SMP_H
+
/*
* Scheduler internal SMP callback types and methods between the scheduler
* and other internal parts of the core kernel:
*/
+#include <linux/types.h>
extern void sched_ttwu_pending(void *arg);
@@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void);
#else
static inline void flush_smp_call_function_queue(void) { }
#endif
+
+#endif /* _KERNEL_SCHED_SMP_H */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 4346fd81c31f..d1c9429a4ac5 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,6 +2,7 @@
/*
* /proc/schedstat implementation
*/
+#include "sched.h"
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
@@ -114,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "timestamp %lu\n", jiffies);
} else {
struct rq *rq;
-#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcount = 0;
-#endif
cpu = (unsigned long)(v - 2);
rq = cpu_rq(cpu);
@@ -132,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
-#ifdef CONFIG_SMP
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -163,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_move_balance);
}
rcu_read_unlock();
-#endif
}
return 0;
}
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 452826df6ae1..26f3fd4d34ce 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
-#else
+#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
-#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
psi_task_switch(prev, next, sleep);
}
-#else /* CONFIG_PSI */
+#else /* !CONFIG_PSI: */
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
@@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev,
bool sleep) {}
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
struct task_struct *prev) {}
-#endif /* CONFIG_PSI */
+#endif /* !CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
/*
@@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n
# define sched_info_enqueue(rq, t) do { } while (0)
# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
-#endif /* CONFIG_SCHED_INFO */
+#endif /* !CONFIG_SCHED_INFO */
#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 058dd42e3d9b..2d4e279f05ee 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,8 +7,8 @@
*
* See kernel/stop_machine.c
*/
+#include "sched.h"
-#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
@@ -20,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
return sched_stop_runnable(rq);
}
-#endif /* CONFIG_SMP */
static void
wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags)
@@ -106,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = {
.put_prev_task = put_prev_task_stop,
.set_next_task = set_next_task_stop,
-#ifdef CONFIG_SMP
.balance = balance_stop,
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
-#endif
.task_tick = task_tick_stop,
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 72505cd3b60a..0fef6496c4c8 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,6 +2,7 @@
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
+#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 547c1f05b667..77ae87f36e84 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment)
return 0;
}
-#endif
+#endif /* __ARCH_WANT_SYS_NICE */
/**
* task_prio - return the priority value of a given task.
@@ -209,10 +209,8 @@ int idle_cpu(int cpu)
if (rq->nr_running)
return 0;
-#ifdef CONFIG_SMP
if (rq->ttwu_pending)
return 0;
-#endif
return 1;
}
@@ -255,8 +253,7 @@ int sched_core_idle_cpu(int cpu)
return idle_cpu(cpu);
}
-
-#endif
+#endif /* CONFIG_SCHED_CORE */
/**
* find_process_by_pid - find a process with a matching PID value.
@@ -448,7 +445,7 @@ static inline int uclamp_validate(struct task_struct *p,
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
-#endif
+#endif /* !CONFIG_UCLAMP_TASK */
/*
* Allow unprivileged RT tasks to decrease priority.
@@ -642,7 +639,6 @@ change:
goto unlock;
}
#endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_SMP
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
@@ -658,7 +654,6 @@ change:
goto unlock;
}
}
-#endif
}
/* Re-check policy now with rq lock held: */
@@ -1120,7 +1115,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
-#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
/*
@@ -1149,7 +1143,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
return 0;
}
-#endif /* CONFIG_SMP */
int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
@@ -1242,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
if (user_mask) {
cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
+ } else {
return -ENOMEM;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b958fe48e020..977e133bb8a4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,7 +3,9 @@
* Scheduler topology setup/handling methods
*/
+#include <linux/sched/isolation.h>
#include <linux/bsearch.h>
+#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
void sched_domains_mutex_lock(void)
@@ -87,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!(sd->flags & SD_OVERLAP) &&
+ if (!(sd->flags & SD_NUMA) &&
cpumask_intersects(groupmask, sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -100,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
group->sgc->id,
cpumask_pr_args(sched_group_span(group)));
- if ((sd->flags & SD_OVERLAP) &&
+ if ((sd->flags & SD_NUMA) &&
!cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
printk(KERN_CONT " mask=%*pbl",
cpumask_pr_args(group_balance_mask(group)));
@@ -313,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void)
}
late_initcall(sched_energy_aware_sysctl_init);
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
static void free_pd(struct perf_domain *pd)
{
@@ -449,9 +451,9 @@ free:
return false;
}
-#else
+#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */
static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
+#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
static void free_rootdomain(struct rcu_head *rcu)
{
@@ -1318,8 +1320,6 @@ next:
update_group_capacity(sd, cpu);
}
-#ifdef CONFIG_SMP
-
/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */
void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
{
@@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
* "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
* which is shared by all the overlapping groups.
*/
- WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+ WARN_ON_ONCE(sd->flags & SD_NUMA);
sg = sd->groups;
if (cpu != sg->asym_prefer_cpu) {
@@ -1374,8 +1374,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
}
}
-#endif /* CONFIG_SMP */
-
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
@@ -1598,7 +1596,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-#endif
+#endif /* CONFIG_NUMA */
/*
* SD_flags allowed in topology descriptions.
@@ -1714,7 +1712,7 @@ sd_init(struct sched_domain_topology_level *tl,
SD_WAKE_AFFINE);
}
-#endif
+#endif /* CONFIG_NUMA */
} else {
sd->cache_nice_tries = 1;
}
@@ -1739,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl,
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT),
#endif
#ifdef CONFIG_SCHED_CLUSTER
- { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+ SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS),
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC),
#endif
- { cpu_cpu_mask, SD_INIT_NAME(PKG) },
+ SDTL_INIT(cpu_cpu_mask, NULL, PKG),
{ NULL, },
};
@@ -2010,23 +2008,14 @@ void sched_init_numa(int offline_node)
/*
* Add the NUMA identity distance, aka single NODE.
*/
- tl[i++] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .numa_level = 0,
- SD_INIT_NAME(NODE)
- };
+ tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE);
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 1; j < nr_levels; i++, j++) {
- tl[i] = (struct sched_domain_topology_level){
- .mask = sd_numa_mask,
- .sd_flags = cpu_numa_flags,
- .flags = SDTL_OVERLAP,
- .numa_level = j,
- SD_INIT_NAME(NUMA)
- };
+ tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
+ tl[i].numa_level = j;
}
sched_domain_topology_saved = sched_domain_topology;
@@ -2337,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sd) {
sd = *per_cpu_ptr(sdd->sd, j);
- if (sd && (sd->flags & SD_OVERLAP))
+ if (sd && (sd->flags & SD_NUMA))
free_sched_groups(sd->groups, 0);
kfree(*per_cpu_ptr(sdd->sd, j));
}
@@ -2403,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
id_seen = sched_domains_tmpmask2;
for_each_sd_topology(tl) {
+ int tl_common_flags = 0;
+
+ if (tl->sd_flags)
+ tl_common_flags = (*tl->sd_flags)();
/* NUMA levels are allowed to overlap */
- if (tl->flags & SDTL_OVERLAP)
+ if (tl_common_flags & SD_NUMA)
continue;
cpumask_clear(covered);
@@ -2476,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
- if (tl->flags & SDTL_OVERLAP)
- sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
@@ -2490,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
sd->span_weight = cpumask_weight(sched_domain_span(sd));
- if (sd->flags & SD_OVERLAP) {
+ if (sd->flags & SD_NUMA) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 51e38f5f4701..a6f00012c72e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,6 +4,7 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
+#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index b410b61cec95..1088d3b7012c 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,5 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/sched/debug.h>
+#include "sched.h"
+
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
diff --git a/kernel/signal.c b/kernel/signal.c
index 148082db9a55..e2c928de7d2c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3016,7 +3016,7 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(&ksig->info);
+ vfs_coredump(&ksig->info);
}
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 974f3a3962e8..4649fa4872ff 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -86,13 +86,15 @@ int smpcfd_dead_cpu(unsigned int cpu)
int smpcfd_dying_cpu(unsigned int cpu)
{
/*
- * The IPIs for the smp-call-function callbacks queued by other
- * CPUs might arrive late, either due to hardware latencies or
- * because this CPU disabled interrupts (inside stop-machine)
- * before the IPIs were sent. So flush out any pending callbacks
- * explicitly (without waiting for the IPIs to arrive), to
- * ensure that the outgoing CPU doesn't go offline with work
- * still pending.
+ * The IPIs for the smp-call-function callbacks queued by other CPUs
+ * might arrive late, either due to hardware latencies or because this
+ * CPU disabled interrupts (inside stop-machine) before the IPIs were
+ * sent. So flush out any pending callbacks explicitly (without waiting
+ * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go
+ * offline with work still pending.
+ *
+ * This runs with interrupts disabled inside the stopper task invoked by
+ * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush.
*/
__flush_smp_call_function_queue(false);
irq_work_run();
@@ -418,6 +420,10 @@ void __smp_call_single_queue(int cpu, struct llist_node *node)
*/
static int generic_exec_single(int cpu, call_single_data_t *csd)
{
+ /*
+ * Preemption already disabled here so stopper cannot run on this CPU,
+ * ensuring mutually exclusive CPU offlining and last IPI flush.
+ */
if (cpu == smp_processor_id()) {
smp_call_func_t func = csd->func;
void *info = csd->info;
@@ -638,8 +644,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int err;
/*
- * prevent preemption and reschedule on another processor,
- * as well as CPU removal
+ * Prevent preemption and reschedule on another CPU, as well as CPU
+ * removal. This prevents stopper from running on this CPU, thus
+ * providing mutual exclusion of the below cpu_online() check and
+ * IPI sending ensuring IPI are not missed by CPU going offline.
*/
this_cpu = get_cpu();
@@ -741,32 +749,19 @@ EXPORT_SYMBOL_GPL(smp_call_function_single_async);
*
* Selection preference:
* 1) current cpu if in @mask
- * 2) any cpu of current node if in @mask
- * 3) any other online cpu in @mask
+ * 2) nearest cpu in @mask, based on NUMA topology
*/
int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait)
{
unsigned int cpu;
- const struct cpumask *nodemask;
int ret;
/* Try for same CPU (cheapest) */
cpu = get_cpu();
- if (cpumask_test_cpu(cpu, mask))
- goto call;
-
- /* Try for same node. */
- nodemask = cpumask_of_node(cpu_to_node(cpu));
- for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
- cpu = cpumask_next_and(cpu, nodemask, mask)) {
- if (cpu_online(cpu))
- goto call;
- }
+ if (!cpumask_test_cpu(cpu, mask))
+ cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu));
- /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
- cpu = cpumask_any_and(mask, cpu_online_mask);
-call:
ret = smp_call_function_single(cpu, func, info, wait);
put_cpu();
return ret;
@@ -792,7 +787,6 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
bool wait = scf_flags & SCF_WAIT;
int nr_cpus = 0;
bool run_remote = false;
- bool run_local = false;
lockdep_assert_preemption_disabled();
@@ -814,19 +808,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
*/
WARN_ON_ONCE(!in_task());
- /* Check if we need local execution. */
- if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) &&
- (!cond_func || cond_func(this_cpu, info)))
- run_local = true;
-
/* Check if we need remote execution, i.e., any CPU excluding this one. */
- cpu = cpumask_first_and(mask, cpu_online_mask);
- if (cpu == this_cpu)
- cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
- if (cpu < nr_cpu_ids)
- run_remote = true;
-
- if (run_remote) {
+ if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) {
cfd = this_cpu_ptr(&cfd_data);
cpumask_and(cfd->cpumask, mask, cpu_online_mask);
__cpumask_clear_cpu(this_cpu, cfd->cpumask);
@@ -840,6 +823,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
continue;
}
+ /* Work is enqueued on a remote CPU. */
+ run_remote = true;
+
csd_lock(csd);
if (wait)
csd->node.u_flags |= CSD_TYPE_SYNC;
@@ -851,6 +837,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
#endif
trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
+ /*
+ * Kick the remote CPU if this is the first work
+ * item enqueued.
+ */
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
nr_cpus++;
@@ -869,7 +859,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
send_call_function_ipi_mask(cfd->cpumask_ipi);
}
- if (run_local) {
+ /* Check if we need local execution. */
+ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) &&
+ (!cond_func || cond_func(this_cpu, info))) {
unsigned long flags;
local_irq_save(flags);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 1992b62e980b..4503b60ce9bd 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -18,8 +18,6 @@
#include "smpboot.h"
-#ifdef CONFIG_SMP
-
#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
/*
* For the hotplug case we keep the task structs around and reuse
@@ -76,8 +74,6 @@ void __init idle_threads_init(void)
}
#endif
-#endif /* #ifdef CONFIG_SMP */
-
static LIST_HEAD(hotplug_threads);
static DEFINE_MUTEX(smpboot_threads_lock);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d0562115b..3fe6b0c99f3d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
}
static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
- struct cpu_stop_work *work,
- struct wake_q_head *wakeq)
+ struct cpu_stop_work *work)
{
list_add_tail(&work->list, &stopper->works);
- wake_q_add(wakeq, stopper->thread);
}
/* queue @work to @stopper. if offline, @work is completed immediately */
static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- DEFINE_WAKE_Q(wakeq);
unsigned long flags;
bool enabled;
@@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
raw_spin_lock_irqsave(&stopper->lock, flags);
enabled = stopper->enabled;
if (enabled)
- __cpu_stop_queue_work(stopper, work, &wakeq);
+ __cpu_stop_queue_work(stopper, work);
else if (work->done)
cpu_stop_signal_done(work->done);
raw_spin_unlock_irqrestore(&stopper->lock, flags);
- wake_up_q(&wakeq);
+ if (enabled)
+ wake_up_process(stopper->thread);
preempt_enable();
return enabled;
@@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
{
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
- DEFINE_WAKE_Q(wakeq);
int err;
retry:
@@ -300,8 +297,8 @@ retry:
}
err = 0;
- __cpu_stop_queue_work(stopper1, work1, &wakeq);
- __cpu_stop_queue_work(stopper2, work2, &wakeq);
+ __cpu_stop_queue_work(stopper1, work1);
+ __cpu_stop_queue_work(stopper2, work2);
unlock:
raw_spin_unlock(&stopper2->lock);
@@ -316,7 +313,10 @@ unlock:
goto retry;
}
- wake_up_q(&wakeq);
+ if (!err) {
+ wake_up_process(stopper1->thread);
+ wake_up_process(stopper2->thread);
+ }
preempt_enable();
return err;
diff --git a/kernel/sys.c b/kernel/sys.c
index adc0de0aa364..18a037cc6f61 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -181,6 +181,35 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
+static const struct ctl_table overflow_sysctl_table[] = {
+ {
+ .procname = "overflowuid",
+ .data = &overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+ {
+ .procname = "overflowgid",
+ .data = &overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+};
+
+static int __init init_overflow_sysctl(void)
+{
+ register_sysctl_init("kernel", overflow_sysctl_table);
+ return 0;
+}
+
+postcore_initcall(init_overflow_sysctl);
+
/*
* Returns true if current's euid is same as p's uid or euid,
* or has CAP_SYS_NICE to p's user_ns.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9b4f0cff76ea..cb6196e3fa99 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1,69 +1,28 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* sysctl.c: General linux system control interface
- *
- * Begun 24 March 1995, Stephen Tweedie
- * Added /proc support, Dec 1995
- * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas.
- * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver.
- * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver.
- * Dynamic registration fixes, Stephen Tweedie.
- * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn.
- * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris
- * Horn.
- * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer.
- * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer.
- * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill
- * Wendling.
- * The list_for_each() macro wasn't appropriate for the sysctl loop.
- * Removed it and replaced it with older style, 03/23/00, Bill Wendling
*/
-#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/bitmap.h>
-#include <linux/printk.h>
#include <linux/proc_fs.h>
-#include <linux/security.h>
#include <linux/ctype.h>
-#include <linux/filter.h>
-#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
-#include <linux/net.h>
-#include <linux/sysrq.h>
#include <linux/highuid.h>
#include <linux/writeback.h>
-#include <linux/ratelimit.h>
#include <linux/initrd.h>
-#include <linux/key.h>
#include <linux/times.h>
#include <linux/limits.h>
#include <linux/syscalls.h>
-#include <linux/nfs_fs.h>
-#include <linux/acpi.h>
-#include <linux/reboot.h>
-#include <linux/kmod.h>
#include <linux/capability.h>
-#include <linux/binfmts.h>
-#include <linux/sched/sysctl.h>
-#include <linux/mount.h>
-#include <linux/pid.h>
#include "../lib/kstrtox.h"
#include <linux/uaccess.h>
#include <asm/processor.h>
-#ifdef CONFIG_X86
-#include <asm/nmi.h>
-#include <asm/io.h>
-#endif
-#ifdef CONFIG_RT_MUTEXES
-#include <linux/rtmutex.h>
-#endif
-
/* shared constants to be used in various sysctls */
const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
EXPORT_SYMBOL(sysctl_vals);
@@ -736,49 +695,6 @@ int proc_douintvec(const struct ctl_table *table, int write, void *buffer,
do_proc_douintvec_conv, NULL);
}
-/*
- * Taint values can only be increased
- * This means we can safely use a temporary.
- */
-static int proc_taint(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- struct ctl_table t;
- unsigned long tmptaint = get_taint();
- int err;
-
- if (write && !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- t = *table;
- t.data = &tmptaint;
- err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
- if (err < 0)
- return err;
-
- if (write) {
- int i;
-
- /*
- * If we are relying on panic_on_taint not producing
- * false positives due to userspace input, bail out
- * before setting the requested taint flags.
- */
- if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint))
- return -EINVAL;
-
- /*
- * Poor man's atomic or. Not worth adding a primitive
- * to everyone's atomic.h for this
- */
- for (i = 0; i < TAINT_FLAGS_COUNT; i++)
- if ((1UL << i) & tmptaint)
- add_taint(i, LOCKDEP_STILL_OK);
- }
-
- return err;
-}
-
/**
* struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure
* @min: pointer to minimum allowable value
@@ -968,26 +884,6 @@ int proc_dou8vec_minmax(const struct ctl_table *table, int write,
}
EXPORT_SYMBOL_GPL(proc_dou8vec_minmax);
-#ifdef CONFIG_MAGIC_SYSRQ
-static int sysrq_sysctl_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
-{
- int tmp, ret;
-
- tmp = sysrq_mask();
-
- ret = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (ret || !write)
- return ret;
-
- if (write)
- sysrq_toggle_support(tmp);
-
- return 0;
-}
-#endif
-
static int __do_proc_doulongvec_minmax(void *data,
const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos,
@@ -1292,28 +1188,6 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buf
do_proc_dointvec_ms_jiffies_conv, NULL);
}
-static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
- size_t *lenp, loff_t *ppos)
-{
- struct pid *new_pid;
- pid_t tmp;
- int r;
-
- tmp = pid_vnr(cad_pid);
-
- r = __do_proc_dointvec(&tmp, table, write, buffer,
- lenp, ppos, NULL, NULL);
- if (r || !write)
- return r;
-
- new_pid = find_get_pid(tmp);
- if (!new_pid)
- return -ESRCH;
-
- put_pid(xchg(&cad_pid, new_pid));
- return 0;
-}
-
/**
* proc_do_large_bitmap - read/write from/to a large bitmap
* @table: the sysctl table
@@ -1580,15 +1454,9 @@ int proc_do_static_key(const struct ctl_table *table, int write,
return ret;
}
-static const struct ctl_table kern_table[] = {
+static const struct ctl_table sysctl_subsys_table[] = {
#ifdef CONFIG_PROC_SYSCTL
{
- .procname = "tainted",
- .maxlen = sizeof(long),
- .mode = 0644,
- .proc_handler = proc_taint,
- },
- {
.procname = "sysctl_writes_strict",
.data = &sysctl_writes_strict,
.maxlen = sizeof(int),
@@ -1598,95 +1466,6 @@ static const struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
-#ifdef CONFIG_PARISC
- {
- .procname = "soft-power",
- .data = &pwrsw_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
- {
- .procname = "unaligned-trap",
- .data = &unaligned_enabled,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_MODULES
- {
- .procname = "modprobe",
- .data = &modprobe_path,
- .maxlen = KMOD_PATH_LEN,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
- {
- .procname = "modules_disabled",
- .data = &modules_disabled,
- .maxlen = sizeof(int),
- .mode = 0644,
- /* only handle a transition from default "0" to "1" */
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_ONE,
- },
-#endif
-#ifdef CONFIG_UEVENT_HELPER
- {
- .procname = "hotplug",
- .data = &uevent_helper,
- .maxlen = UEVENT_HELPER_PATH_LEN,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
-#endif
-#ifdef CONFIG_MAGIC_SYSRQ
- {
- .procname = "sysrq",
- .data = NULL,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = sysrq_sysctl_handler,
- },
-#endif
-#ifdef CONFIG_PROC_SYSCTL
- {
- .procname = "cad_pid",
- .data = NULL,
- .maxlen = sizeof (int),
- .mode = 0600,
- .proc_handler = proc_do_cad_pid,
- },
-#endif
- {
- .procname = "threads-max",
- .data = NULL,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = sysctl_max_threads,
- },
- {
- .procname = "overflowuid",
- .data = &overflowuid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_MAXOLDUID,
- },
- {
- .procname = "overflowgid",
- .data = &overflowgid,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_MAXOLDUID,
- },
{
.procname = "ngroups_max",
.data = (void *)&ngroups_max,
@@ -1701,20 +1480,10 @@ static const struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
-#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
- defined(CONFIG_DEBUG_STACKOVERFLOW)
- {
- .procname = "panic_on_stackoverflow",
- .data = &sysctl_panic_on_stackoverflow,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#if defined(CONFIG_MMU)
+#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
{
- .procname = "randomize_va_space",
- .data = &randomize_va_space,
+ .procname = "unaligned-trap",
+ .data = &unaligned_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
@@ -1729,40 +1498,11 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_RT_MUTEXES
- {
- .procname = "max_lock_depth",
- .data = &max_lock_depth,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-#ifdef CONFIG_TREE_RCU
- {
- .procname = "panic_on_rcu_stall",
- .data = &sysctl_panic_on_rcu_stall,
- .maxlen = sizeof(sysctl_panic_on_rcu_stall),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "max_rcu_stall_to_panic",
- .data = &sysctl_max_rcu_stall_to_panic,
- .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_INT_MAX,
- },
-#endif
};
int __init sysctl_init_bases(void)
{
- register_sysctl_init("kernel", kern_table);
+ register_sysctl_init("kernel", sysctl_subsys_table);
return 0;
}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b0b97a60aaa6..7c6a52f7836c 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE
help
Tracks idle state on behalf of RCU.
-if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
+if GENERIC_CLOCKEVENTS
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
# are supported independent of this.
@@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
interval and NTP's maximum frequency drift of 500 parts
per million. If the clocksource is good enough for NTP,
it is good enough for the clocksource watchdog!
+endif
+
+config POSIX_AUX_CLOCKS
+ bool "Enable auxiliary POSIX clocks"
+ depends on POSIX_TIMERS
+ help
+ Auxiliary POSIX clocks are clocks which can be steered
+ independently of the core timekeeper, which controls the
+ MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to
+ provide e.g. lockless time accessors to independent PTP clocks
+ and other clock domains, which are not correlated to the TAI/NTP
+ notion of time.
endmenu
-endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6a8bc7da9062..e400fe150f9d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void)
return;
/* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_first(cpu_online_mask);
- if (cpu == smp_processor_id())
- cpu = cpumask_next(cpu, cpu_online_mask);
+ cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
return;
cpumask_set_cpu(cpu, &cpus_chosen);
@@ -589,9 +587,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* Cycle through CPUs to check if the CPUs stay synchronized
* to each other.
*/
- next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(cpu_online_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
/*
* Arm timer if not already pending: could race with concurrent
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index e3642278df43..667452768ed3 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -242,6 +242,11 @@ static void timens_set_vvar_page(struct task_struct *task,
for (i = 0; i < CS_BASES; i++)
timens_setup_vdso_clock_data(&vc[i], ns);
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
+
out:
mutex_unlock(&offset_lock);
}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b837d3d9d325..97fa99b96dd0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/rtc.h>
#include <linux/audit.h>
+#include <linux/timekeeper_internal.h>
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -86,14 +87,16 @@ struct ntp_data {
#endif
};
-static struct ntp_data tk_ntp_data = {
- .tick_usec = USER_TICK_USEC,
- .time_state = TIME_OK,
- .time_status = STA_UNSYNC,
- .time_constant = 2,
- .time_maxerror = NTP_PHASE_LIMIT,
- .time_esterror = NTP_PHASE_LIMIT,
- .ntp_next_leap_sec = TIME64_MAX,
+static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = {
+ [ 0 ... TIMEKEEPERS_MAX - 1 ] = {
+ .tick_usec = USER_TICK_USEC,
+ .time_state = TIME_OK,
+ .time_status = STA_UNSYNC,
+ .time_constant = 2,
+ .time_maxerror = NTP_PHASE_LIMIT,
+ .time_esterror = NTP_PHASE_LIMIT,
+ .ntp_next_leap_sec = TIME64_MAX,
+ },
};
#define SECS_PER_DAY 86400
@@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset)
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- real_secs = __ktime_get_real_seconds();
+ real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
secs = (long)(real_secs - ntpdata->time_reftime);
if (unlikely(ntpdata->time_status & STA_FREQHOLD))
secs = 0;
@@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata)
/**
* ntp_clear - Clears the NTP state variables
+ * @tkid: Timekeeper ID to be able to select proper ntp data array member
*/
-void ntp_clear(void)
+void ntp_clear(unsigned int tkid)
{
- __ntp_clear(&tk_ntp_data);
+ __ntp_clear(&tk_ntp_data[tkid]);
}
-u64 ntp_tick_length(void)
+u64 ntp_tick_length(unsigned int tkid)
{
- return tk_ntp_data.tick_length;
+ return tk_ntp_data[tkid].tick_length;
}
/**
* ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ * @tkid: Timekeeper ID
*
- * Provides the time of the next leapsecond against CLOCK_REALTIME in
- * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next
+ * leap second against CLOCK_REALTIME in a ktime_t format if a
+ * leap second is pending. KTIME_MAX otherwise.
*/
-ktime_t ntp_get_next_leap(void)
+ktime_t ntp_get_next_leap(unsigned int tkid)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
- ktime_t ret;
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
+
+ if (tkid != TIMEKEEPER_CORE)
+ return KTIME_MAX;
if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS))
return ktime_set(ntpdata->ntp_next_leap_sec, 0);
- ret = KTIME_MAX;
- return ret;
+
+ return KTIME_MAX;
}
/*
@@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void)
*
* Also handles leap second processing, and returns leap offset
*/
-int second_overflow(time64_t secs)
+int second_overflow(unsigned int tkid, time64_t secs)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
s64 delta;
int leap = 0;
s32 rem;
@@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns
*/
static inline bool ntp_synced(void)
{
- return !(tk_ntp_data.time_status & STA_UNSYNC);
+ return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC);
}
/*
@@ -702,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k
* reference time to current time.
*/
if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL))
- ntpdata->time_reftime = __ktime_get_real_seconds();
+ ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
/* only set allowed bits */
ntpdata->time_status &= STA_RONLY;
@@ -759,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct
* adjtimex() mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad)
+int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
int result;
if (txc->modes & ADJ_ADJTIME) {
@@ -1031,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error)
*/
void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
struct pps_normtime pts_norm, freq_norm;
- struct ntp_data *ntpdata = &tk_ntp_data;
pts_norm = pps_normalize_ts(*phase_ts);
@@ -1083,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t
static int __init ntp_tick_adj_setup(char *str)
{
- int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj);
+ int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj);
if (rc)
return rc;
- tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT;
+ tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
}
-
__setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
- ntp_clear();
+ for (int id = 0; id < TIMEKEEPERS_MAX; id++)
+ __ntp_clear(tk_ntp_data + id);
ntp_init_cmos_sync();
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 5a633dce9057..7084d839c207 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -3,14 +3,13 @@
#define _LINUX_NTP_INTERNAL_H
extern void ntp_init(void);
-extern void ntp_clear(void);
+extern void ntp_clear(unsigned int tkid);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
-extern u64 ntp_tick_length(void);
-extern ktime_t ntp_get_next_leap(void);
-extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct __kernel_timex *txc,
- const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad);
+extern u64 ntp_tick_length(unsigned int tkid);
+extern ktime_t ntp_get_next_leap(unsigned int tkid);
+extern int second_overflow(unsigned int tkid, time64_t secs);
+extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 50e8d04ab661..2e5b89d7d866 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 2053b1a4c9e4..8b582174b1f9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -1526,6 +1526,9 @@ static const struct k_clock * const posix_clocks[] = {
[CLOCK_REALTIME_ALARM] = &alarm_clock,
[CLOCK_BOOTTIME_ALARM] = &alarm_clock,
[CLOCK_TAI] = &clock_tai,
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+ [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux,
+#endif
};
static const struct k_clock *clockid_to_kclock(const clockid_t id)
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 61906f0688c1..7f259e845d24 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic;
extern const struct k_clock clock_process;
extern const struct k_clock clock_thread;
extern const struct k_clock alarm_clock;
+extern const struct k_clock clock_aux;
void posix_timer_queue_signal(struct k_itimer *timr);
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index e6285288d765..3d2a354cfe1c 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -6,7 +6,7 @@
#include <linux/timecounter.h>
void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
+ struct cyclecounter *cc,
u64 start_tstamp)
{
tc->cc = cc;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a009c91f7b05..059fa8b79be6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -6,6 +6,7 @@
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/kobject.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -25,6 +26,8 @@
#include <linux/audit.h>
#include <linux/random.h>
+#include <vdso/auxclock.h>
+
#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -53,7 +56,32 @@ struct tk_data {
raw_spinlock_t lock;
} ____cacheline_aligned;
-static struct tk_data tk_core;
+static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];
+
+/* The core timekeeper */
+#define tk_core (timekeeper_data[TIMEKEEPER_CORE])
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
+}
+#else
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return false;
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return false;
+}
+#endif
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -113,6 +141,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static __init void tk_aux_setup(void);
+static void tk_aux_update_clocksource(void);
+static void tk_aux_advance(void);
+#else
+static inline void tk_aux_setup(void) { }
+static inline void tk_aux_update_clocksource(void) { }
+static inline void tk_aux_advance(void) { }
+#endif
+
unsigned long timekeeper_lock_irqsave(void)
{
unsigned long flags;
@@ -601,7 +639,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
*/
static inline void tk_update_leap_state(struct timekeeper *tk)
{
- tk->next_leap_ktime = ntp_get_next_leap();
+ tk->next_leap_ktime = ntp_get_next_leap(tk->id);
if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
@@ -663,7 +701,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd)
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
lockdep_assert_held(&tkd->lock);
@@ -678,18 +716,22 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
if (action & TK_CLEAR_NTP) {
tk->ntp_error = 0;
- ntp_clear();
+ ntp_clear(tk->id);
}
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_vsyscall(tk);
- update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ if (tk->id == TIMEKEEPER_CORE) {
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
- tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
- update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ } else if (tk_is_aux(tk)) {
+ vdso_time_update_aux(tk);
+ }
if (action & TK_CLOCK_WAS_SET)
tk->clock_was_set_seq++;
@@ -975,9 +1017,14 @@ time64_t ktime_get_real_seconds(void)
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
/**
- * __ktime_get_real_seconds - The same as ktime_get_real_seconds
- * but without the sequence counter protect. This internal function
- * is called just when timekeeping lock is already held.
+ * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
+ *
+ * The same as ktime_get_real_seconds() but without the sequence counter
+ * protection. This function is used in restricted contexts like the x86 MCE
+ * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
+ * completed modification and only to be used for such critical contexts.
+ *
+ * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
*/
noinstr time64_t __ktime_get_real_seconds(void)
{
@@ -1256,7 +1303,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
- struct system_counterval_t system_counterval;
+ struct system_counterval_t system_counterval = {};
struct timekeeper *tk = &tk_core.timekeeper;
u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
@@ -1412,41 +1459,73 @@ int do_settimeofday64(const struct timespec64 *ts)
}
EXPORT_SYMBOL(do_settimeofday64);
+static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
+{
+ return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
+}
+
/**
- * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * __timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tkd: Pointer to the timekeeper to modify
* @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
-static int timekeeping_inject_offset(const struct timespec64 *ts)
+static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
{
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ struct timespec64 tmp;
+
if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- struct timespec64 tmp;
-
- timekeeping_forward_now(tks);
+ timekeeping_forward_now(tks);
+ if (timekeeper_is_core_tk(tks)) {
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tks), *ts);
if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
!timespec64_valid_settod(&tmp)) {
- timekeeping_restore_shadow(&tk_core);
+ timekeeping_restore_shadow(tkd);
return -EINVAL;
}
tk_xtime_add(tks, ts);
tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ } else {
+ struct tk_read_base *tkr_mono = &tks->tkr_mono;
+ ktime_t now, offs;
+
+ /* Get the current time */
+ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
+ /* Add the relative offset change */
+ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));
+
+ /* Prevent that the resulting time becomes negative */
+ if (ktime_add(now, offs) < 0) {
+ timekeeping_restore_shadow(tkd);
+ return -EINVAL;
+ }
+ tks->offs_aux = offs;
}
- /* Signal hrtimers about time change */
- clock_was_set(CLOCK_SET_WALL);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
return 0;
}
+static int timekeeping_inject_offset(const struct timespec64 *ts)
+{
+ int ret;
+
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
+ ret = __timekeeping_inject_offset(&tk_core, ts);
+
+ /* Signal hrtimers about time change */
+ if (!ret)
+ clock_was_set(CLOCK_SET_WALL);
+ return ret;
+}
+
/*
* Indicates if there is an offset between the system clock and the hardware
* clock/persistent clock/rtc.
@@ -1522,6 +1601,8 @@ static int change_clocksource(void *data)
timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
}
+ tk_aux_update_clocksource();
+
if (old) {
if (old->disable)
old->disable(old);
@@ -1573,6 +1654,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_raw_ts64);
+/**
+ * ktime_get_clock_ts64 - Returns time of a clock in a timespec
+ * @id: POSIX clock ID of the clock to read
+ * @ts: Pointer to the timespec64 to be set
+ *
+ * The timestamp is invalidated (@ts->sec is set to -1) if the
+ * clock @id is not available.
+ */
+void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
+{
+ /* Invalidate time stamp */
+ ts->tv_sec = -1;
+ ts->tv_nsec = 0;
+
+ switch (id) {
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC_RAW:
+ ktime_get_raw_ts64(ts);
+ return;
+ case CLOCK_AUX ... CLOCK_AUX_LAST:
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+ ktime_get_aux_ts64(id, ts);
+ return;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1649,10 +1763,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
*boot_offset = ns_to_timespec64(local_clock());
}
-static __init void tkd_basic_setup(struct tk_data *tkd)
+static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
{
raw_spin_lock_init(&tkd->lock);
seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
+ tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
+ tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
}
/*
@@ -1682,7 +1798,8 @@ void __init timekeeping_init(void)
struct timekeeper *tks = &tk_core.shadow_timekeeper;
struct clocksource *clock;
- tkd_basic_setup(&tk_core);
+ tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
+ tk_aux_setup();
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
if (timespec64_valid_settod(&wall_time) &&
@@ -2034,7 +2151,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*/
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- u64 ntp_tl = ntp_tick_length();
+ u64 ntp_tl = ntp_tick_length(tk->id);
u32 mult;
/*
@@ -2115,7 +2232,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
}
/* Figure out if its a leap sec and apply if needed */
- leap = second_overflow(tk->xtime_sec);
+ leap = second_overflow(tk->id, tk->xtime_sec);
if (unlikely(leap)) {
struct timespec64 ts;
@@ -2181,16 +2298,14 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
-static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
- struct timekeeper *real_tk = &tk_core.timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
+ struct timekeeper *real_tk = &tkd->timekeeper;
unsigned int clock_set = 0;
int shift = 0, maxshift;
u64 offset, orig_offset;
- guard(raw_spinlock_irqsave)(&tk_core.lock);
-
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
return false;
@@ -2214,7 +2329,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
/* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
+ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
@@ -2239,19 +2354,27 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
if (orig_offset != offset)
tk_update_coarse_nsecs(tk);
- timekeeping_update_from_shadow(&tk_core, clock_set);
+ timekeeping_update_from_shadow(tkd, clock_set);
return !!clock_set;
}
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+{
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+ return __timekeeping_advance(&tk_core, mode);
+}
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
+ * It also updates the enabled auxiliary clock timekeepers
*/
void update_wall_time(void)
{
if (timekeeping_advance(TK_ADV_TICK))
clock_was_set_delayed();
+ tk_aux_advance();
}
/**
@@ -2449,7 +2572,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct __kernel_timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2508,6 +2631,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return -EINVAL;
}
+ if (aux_clock) {
+ /* Auxiliary clocks are similar to TAI and do not have leap seconds */
+ if (txc->status & (STA_INS | STA_DEL))
+ return -EINVAL;
+
+ /* No TAI offset setting */
+ if (txc->modes & ADJ_TAI)
+ return -EINVAL;
+
+ /* No PPS support either */
+ if (txc->status & (STA_PPSFREQ | STA_PPSTIME))
+ return -EINVAL;
+ }
+
return 0;
}
@@ -2526,74 +2663,103 @@ unsigned long random_get_entropy_fallback(void)
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
-/**
- * do_adjtimex() - Accessor function to NTP __do_adjtimex function
- * @txc: Pointer to kernel_timex structure containing NTP parameters
- */
-int do_adjtimex(struct __kernel_timex *txc)
+struct adjtimex_result {
+ struct audit_ntp_data ad;
+ struct timespec64 delta;
+ bool clock_set;
+};
+
+static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
+ struct adjtimex_result *result)
{
- struct audit_ntp_data ad;
- bool offset_set = false;
- bool clock_set = false;
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ bool aux_clock = !timekeeper_is_core_tk(tks);
struct timespec64 ts;
+ s32 orig_tai, tai;
int ret;
/* Validate the data before disabling interrupts */
- ret = timekeeping_validate_timex(txc);
+ ret = timekeeping_validate_timex(txc, aux_clock);
if (ret)
return ret;
add_device_randomness(txc, sizeof(*txc));
- if (txc->modes & ADJ_SETOFFSET) {
- struct timespec64 delta;
+ if (!aux_clock)
+ ktime_get_real_ts64(&ts);
+ else
+ tk_get_aux_ts64(tkd->timekeeper.id, &ts);
- delta.tv_sec = txc->time.tv_sec;
- delta.tv_nsec = txc->time.tv_usec;
+ add_device_randomness(&ts, sizeof(ts));
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+
+ if (!tks->clock_valid)
+ return -ENODEV;
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ result->delta.tv_sec = txc->time.tv_sec;
+ result->delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
- delta.tv_nsec *= 1000;
- ret = timekeeping_inject_offset(&delta);
+ result->delta.tv_nsec *= 1000;
+ ret = __timekeeping_inject_offset(tkd, &result->delta);
if (ret)
return ret;
-
- offset_set = delta.tv_sec != 0;
- audit_tk_injoffset(delta);
+ result->clock_set = true;
}
- audit_ntp_init(&ad);
+ orig_tai = tai = tks->tai_offset;
+ ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);
- ktime_get_real_ts64(&ts);
- add_device_randomness(&ts, sizeof(ts));
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tks, tai);
+ timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
+ result->clock_set = true;
+ } else {
+ tk_update_leap_state_all(&tk_core);
+ }
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- s32 orig_tai, tai;
+ /* Update the multiplier immediately if frequency was set directly */
+ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
+ result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);
- orig_tai = tai = tks->tai_offset;
- ret = __do_adjtimex(txc, &ts, &tai, &ad);
+ return ret;
+}
- if (tai != orig_tai) {
- __timekeeping_set_tai_offset(tks, tai);
- timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
- clock_set = true;
- } else {
- tk_update_leap_state_all(&tk_core);
- }
- }
+/**
+ * do_adjtimex() - Accessor function to NTP __do_adjtimex function
+ * @txc: Pointer to kernel_timex structure containing NTP parameters
+ */
+int do_adjtimex(struct __kernel_timex *txc)
+{
+ struct adjtimex_result result = { };
+ int ret;
- audit_ntp_log(&ad);
+ ret = __do_adjtimex(&tk_core, txc, &result);
+ if (ret < 0)
+ return ret;
- /* Update the multiplier immediately if frequency was set directly */
- if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
- clock_set |= timekeeping_advance(TK_ADV_FREQ);
+ if (txc->modes & ADJ_SETOFFSET)
+ audit_tk_injoffset(result.delta);
- if (clock_set)
+ audit_ntp_log(&result.ad);
+
+ if (result.clock_set)
clock_was_set(CLOCK_SET_WALL);
- ntp_notify_cmos_timer(offset_set);
+ ntp_notify_cmos_timer(result.delta.tv_sec != 0);
return ret;
}
+/*
+ * Invoked from NTP with the time keeper lock held, so lockless access is
+ * fine.
+ */
+long ktime_get_ntp_seconds(unsigned int id)
+{
+ return timekeeper_data[id].timekeeper.xtime_sec;
+}
+
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
@@ -2607,3 +2773,316 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+#include "posix-timers.h"
+
+/*
+ * Bitmap for the activated auxiliary timekeepers to allow lockless quick
+ * checks in the hot paths without touching extra cache lines. If set, then
+ * the state of the corresponding timekeeper has to be re-checked under
+ * timekeeper::lock.
+ */
+static unsigned long aux_timekeepers;
+
+static inline unsigned int clockid_to_tkid(unsigned int id)
+{
+ return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
+}
+
+static inline struct tk_data *aux_get_tk_data(clockid_t id)
+{
+ if (!clockid_aux_valid(id))
+ return NULL;
+ return &timekeeper_data[clockid_to_tkid(id)];
+}
+
+/* Invoked from timekeeping after a clocksource change */
+static void tk_aux_update_clocksource(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+ if (!tks->clock_valid)
+ continue;
+
+ timekeeping_forward_now(tks);
+ tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
+ }
+}
+
+static void tk_aux_advance(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ /* Lockless quick check to avoid extra cache lines */
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+
+ guard(raw_spinlock)(&aux_tkd->lock);
+ if (aux_tkd->shadow_timekeeper.clock_valid)
+ __timekeeping_advance(aux_tkd, TK_ADV_TICK);
+ }
+}
+
+/**
+ * ktime_get_aux - Get time for a AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @kt: Pointer to ktime_t to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux(clockid_t id, ktime_t *kt)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tk;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ if (!aux_tkd)
+ return false;
+
+ aux_tk = &aux_tkd->timekeeper;
+ do {
+ seq = read_seqcount_begin(&aux_tkd->seq);
+ if (!aux_tk->clock_valid)
+ return false;
+
+ base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
+ nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
+ } while (read_seqcount_retry(&aux_tkd->seq, seq));
+
+ *kt = ktime_add_ns(base, nsecs);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux);
+
+/**
+ * ktime_get_aux_ts64 - Get time for a AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @ts: Pointer to timespec64 to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
+{
+ ktime_t now;
+
+ if (!ktime_get_aux(id, &now))
+ return false;
+ *ts = ktime_to_timespec64(now);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
+
+static int aux_get_res(clockid_t id, struct timespec64 *tp)
+{
+ if (!clockid_aux_valid(id))
+ return -ENODEV;
+
+ tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
+ tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
+ return 0;
+}
+
+static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
+{
+ return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV;
+}
+
+static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks;
+ ktime_t tnow, nsecs;
+
+ if (!timespec64_valid_settod(tnew))
+ return -EINVAL;
+ if (!aux_tkd)
+ return -ENODEV;
+
+ aux_tks = &aux_tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ if (!aux_tks->clock_valid)
+ return -ENODEV;
+
+ /* Forward the timekeeper base time */
+ timekeeping_forward_now(aux_tks);
+ /*
+ * Get the updated base time. tkr_mono.base has not been
+ * updated yet, so do that first. That makes the update
+ * in timekeeping_update_from_shadow() redundant, but
+ * that's harmless. After that @tnow can be calculated
+ * by using tkr_mono::cycle_last, which has been set
+ * by timekeeping_forward_now().
+ */
+ tk_update_ktime_data(aux_tks);
+ nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
+ tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);
+
+ /*
+ * Calculate the new AUX offset as delta to @tnow ("monotonic").
+ * That avoids all the tk::xtime back and forth conversions as
+ * xtime ("realtime") is not applicable for auxiliary clocks and
+ * kept in sync with "monotonic".
+ */
+ aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow);
+
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+ return 0;
+}
+
+static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct adjtimex_result result = { };
+
+ if (!aux_tkd)
+ return -ENODEV;
+
+ /*
+ * @result is ignored for now as there are neither hrtimers nor a
+ * RTC related to auxiliary clocks for now.
+ */
+ return __do_adjtimex(aux_tkd, txc, &result);
+}
+
+const struct k_clock clock_aux = {
+ .clock_getres = aux_get_res,
+ .clock_get_timespec = aux_get_timespec,
+ .clock_set = aux_clock_set,
+ .clock_adj = aux_clock_adj,
+};
+
+static void aux_clock_enable(clockid_t id)
+{
+ struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;
+
+ /* Prevent the core timekeeper from changing. */
+ guard(raw_spinlock_irq)(&tk_core.lock);
+
+ /*
+ * Setup the auxiliary clock assuming that the raw core timekeeper
+ * clock frequency conversion is close enough. Userspace has to
+ * adjust for the deviation via clock_adjtime(2).
+ */
+ guard(raw_spinlock_nested)(&aux_tkd->lock);
+
+ /* Remove leftovers of a previous registration */
+ memset(aux_tks, 0, sizeof(*aux_tks));
+ /* Restore the timekeeper id */
+ aux_tks->id = aux_tkd->timekeeper.id;
+ /* Setup the timekeeper based on the current system clocksource */
+ tk_setup_internals(aux_tks, tkr_raw->clock);
+
+ /* Mark it valid and set it live */
+ aux_tks->clock_valid = true;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static void aux_clock_disable(clockid_t id)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ aux_tkd->shadow_timekeeper.clock_valid = false;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static DEFINE_MUTEX(aux_clock_mutex);
+
+static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+ bool enable;
+
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (kstrtobool(buf, &enable) < 0)
+ return -EINVAL;
+
+ guard(mutex)(&aux_clock_mutex);
+ if (enable == test_bit(id, &aux_timekeepers))
+ return count;
+
+ if (enable) {
+ aux_clock_enable(CLOCK_AUX + id);
+ set_bit(id, &aux_timekeepers);
+ } else {
+ aux_clock_disable(CLOCK_AUX + id);
+ clear_bit(id, &aux_timekeepers);
+ }
+ return count;
+}
+
+static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+
+ return sysfs_emit(buf, "%d\n", test_bit(id, &active));
+}
+
+static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);
+
+static struct attribute *aux_clock_enable_attrs[] = {
+ &aux_clock_enable_attr.attr,
+ NULL
+};
+
+static const struct attribute_group aux_clock_enable_attr_group = {
+ .attrs = aux_clock_enable_attrs,
+};
+
+static int __init tk_aux_sysfs_init(void)
+{
+ struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+
+ if (!tko)
+ return -ENOMEM;
+
+ auxo = kobject_create_and_add("aux_clocks", tko);
+ if (!auxo) {
+ kobject_put(tko);
+ return -ENOMEM;
+ }
+
+ for (int i = 0; i <= MAX_AUX_CLOCKS; i++) {
+ char id[2] = { [0] = '0' + i, };
+ struct kobject *clk = kobject_create_and_add(id, auxo);
+
+ if (!clk)
+ return -ENOMEM;
+
+ int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+late_initcall(tk_aux_sysfs_init);
+
+static __init void tk_aux_setup(void)
+{
+ for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
+ tkd_basic_setup(&timekeeper_data[i], i, false);
+}
+#endif /* CONFIG_POSIX_AUX_CLOCKS */
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 8c9079108ffb..973ede670a36 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
unsigned long timekeeper_lock_irqsave(void);
void timekeeper_unlock_irqrestore(unsigned long flags);
+/* NTP specific interface to access the current seconds value */
+long ktime_get_ntp_seconds(unsigned int id);
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 2f6330831f08..c0c54dc5314c 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -1405,23 +1405,20 @@ u64 tmigr_quick_check(u64 nextevt)
return KTIME_MAX;
do {
- if (!tmigr_check_lonely(group)) {
+ if (!tmigr_check_lonely(group))
return KTIME_MAX;
- } else {
- /*
- * Since current CPU is active, events may not be sorted
- * from bottom to the top because the CPU's event is ignored
- * up to the top and its sibling's events not propagated upwards.
- * Thus keep track of the lowest observed expiry.
- */
- nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
- if (!group->parent)
- return nextevt;
- }
+
+ /*
+ * Since current CPU is active, events may not be sorted
+ * from bottom to the top because the CPU's event is ignored
+ * up to the top and its sibling's events not propagated upwards.
+ * Thus keep track of the lowest observed expiry.
+ */
+ nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
group = group->parent;
} while (group);
- return KTIME_MAX;
+ return nextevt;
}
/*
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 32ef27c71b57..8ba8b0d8a387 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -15,26 +15,25 @@
#include "timekeeping_internal.h"
+static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base)
+{
+ vc->cycle_last = base->cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vc->max_cycles = base->clock->max_cycles;
+#endif
+ vc->mask = base->mask;
+ vc->mult = base->mult;
+ vc->shift = base->shift;
+}
+
static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
{
struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
u64 nsec, sec;
- vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
-#endif
- vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
- vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
- vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
- vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
-#endif
- vc[CS_RAW].mask = tk->tkr_raw.mask;
- vc[CS_RAW].mult = tk->tkr_raw.mult;
- vc[CS_RAW].shift = tk->tkr_raw.shift;
+ fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono);
+ fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw);
/* CLOCK_MONOTONIC */
vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
@@ -119,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk)
if (clock_mode != VDSO_CLOCKMODE_NONE)
update_vdso_time_data(vdata, tk);
- __arch_update_vsyscall(vdata);
+ __arch_update_vdso_clock(&vc[CS_HRES_COARSE]);
+ __arch_update_vdso_clock(&vc[CS_RAW]);
vdso_write_end(vdata);
@@ -136,6 +136,46 @@ void update_vsyscall_tz(void)
__arch_sync_vdso_time_data(vdata);
}
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+void vdso_time_update_aux(struct timekeeper *tk)
+{
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_timestamp *vdso_ts;
+ struct vdso_clock *vc;
+ s32 clock_mode;
+ u64 nsec;
+
+ vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST];
+ vdso_ts = &vc->basetime[VDSO_BASE_AUX];
+ clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ if (!tk->clock_valid)
+ clock_mode = VDSO_CLOCKMODE_NONE;
+
+ /* copy vsyscall data */
+ vdso_write_begin_clock(vc);
+
+ vc->clock_mode = clock_mode;
+
+ if (clock_mode != VDSO_CLOCKMODE_NONE) {
+ fill_clock_configuration(vc, &tk->tkr_mono);
+
+ vdso_ts->sec = tk->xtime_sec;
+
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec += tk->offs_aux;
+ vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+ nsec = nsec << tk->tkr_mono.shift;
+ vdso_ts->nsec = nsec;
+ }
+
+ __arch_update_vdso_clock(vc);
+
+ vdso_write_end_clock(vc);
+
+ __arch_sync_vdso_time_data(vdata);
+}
+#endif
+
/**
* vdso_update_begin - Start of a VDSO update section
*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3f6a7bdc6edf..47168d2afbf1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1875,6 +1875,29 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
case REQ_OP_READ:
rwbs[i++] = 'R';
break;
+ case REQ_OP_ZONE_APPEND:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'A';
+ break;
+ case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_RESET_ALL:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'R';
+ if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL)
+ rwbs[i++] = 'A';
+ break;
+ case REQ_OP_ZONE_FINISH:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'F';
+ break;
+ case REQ_OP_ZONE_OPEN:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'O';
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ rwbs[i++] = 'Z';
+ rwbs[i++] = 'C';
+ break;
default:
rwbs[i++] = 'N';
}
@@ -1890,6 +1913,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
if (opf & REQ_ATOMIC)
rwbs[i++] = 'U';
+ WARN_ON_ONCE(i >= RWBS_LEN);
+
rwbs[i] = '\0';
}
EXPORT_SYMBOL_GPL(blk_fill_rwbs);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 120531268abf..d01e5c910ce1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3136,7 +3136,10 @@ __register_event(struct trace_event_call *call, struct module *mod)
if (ret < 0)
return ret;
+ down_write(&trace_event_sem);
list_add(&call->list, &ftrace_events);
+ up_write(&trace_event_sem);
+
if (call->flags & TRACE_EVENT_FL_DYNAMIC)
atomic_set(&call->refcnt, 0);
else
@@ -3750,6 +3753,8 @@ __trace_add_event_dirs(struct trace_array *tr)
struct trace_event_call *call;
int ret;
+ lockdep_assert_held(&trace_event_sem);
+
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ea8b364b6818..3885aadc434d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1436,15 +1436,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
INIT_LIST_HEAD(&head->list);
- item = kmalloc(sizeof(*item), GFP_KERNEL);
- if (!item) {
- kfree(head);
- goto free_now;
- }
-
- item->filter = filter;
- list_add_tail(&item->list, &head->list);
-
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir)
continue;
@@ -1456,6 +1447,13 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
event_clear_filter(file);
}
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item)
+ goto free_now;
+
+ item->filter = filter;
+ list_add_tail(&item->list, &head->list);
+
delay_free_filter(head);
return;
free_now:
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9234e2c39abf..14d74a7491b8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -455,10 +455,16 @@ static int graph_trace_init(struct trace_array *tr)
return 0;
}
+static struct tracer graph_trace;
+
static int ftrace_graph_trace_args(struct trace_array *tr, int set)
{
trace_func_graph_ent_t entry;
+ /* Do nothing if the current tracer is not this tracer */
+ if (tr->current_trace != &graph_trace)
+ return 0;
+
if (set)
entry = trace_graph_entry_args;
else
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 6819b93309ce..fd259da0aa64 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -637,8 +637,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u
entry = ring_buffer_event_data(event);
- memcpy(&entry->caller, fstack->calls, size);
entry->size = fstack->nr_entries;
+ memcpy(&entry->caller, fstack->calls, size);
trace_buffer_unlock_commit_nostack(buffer, event);
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 424751cdf31f..40830a3ecd96 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -657,7 +657,7 @@ static int parse_btf_arg(char *varname,
ret = query_btf_context(ctx);
if (ret < 0 || ctx->nr_params == 0) {
trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
- return PTR_ERR(params);
+ return -ENOENT;
}
}
params = ctx->params;
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 75af12ff774e..9c58f5b4381d 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -187,6 +187,28 @@ void watchdog_hardlockup_disable(unsigned int cpu)
}
/**
+ * hardlockup_detector_perf_adjust_period - Adjust the event period due
+ * to current cpu frequency change
+ * @period: The target period to be set
+ */
+void hardlockup_detector_perf_adjust_period(u64 period)
+{
+ struct perf_event *event = this_cpu_read(watchdog_ev);
+
+ if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED))
+ return;
+
+ if (!event)
+ return;
+
+ if (event->attr.sample_period == period)
+ return;
+
+ if (perf_event_period(event, period))
+ pr_err("failed to change period to %llu\n", period);
+}
+
+/**
* hardlockup_detector_perf_stop - Globally stop watchdog events
*
* Special interface for x86 to handle the perf HT bug.
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 97f37b5bae66..9f9148075828 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7767,7 +7767,8 @@ void __init workqueue_init_early(void)
restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
-
+ cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();