diff options
Diffstat (limited to 'kernel')
51 files changed, 603 insertions, 309 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index e64ce21f9a80..2ee603a98813 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -134,6 +134,7 @@ config CRASH_DM_CRYPT depends on KEXEC_FILE depends on CRASH_DUMP depends on DM_CRYPT + depends on KEYS help With this option enabled, user space can intereact with /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..b0eae2a3c895 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +677,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +691,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +701,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + struct path *paths; int err; rule->tree = NULL; @@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, path2; - struct vfsmount *tagged; + struct path array[16]; + struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9122c39870bf..f4885514f007 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2134,7 +2134,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b71e428ad936..ad6df48b540c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -884,6 +884,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (fmt[i] == 'p') { sizeof_cur_arg = sizeof(long); + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1])) { + if (tmp_buf) + cur_arg = raw_args[num_spec]; + goto nocopy_fmt; + } + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && fmt[i + 2] == 's') { fmt_ptype = fmt[i + 1]; @@ -891,11 +898,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, goto fmt_str; } - if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || - ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + if (fmt[i + 1] == 'K' || fmt[i + 1] == 'x' || fmt[i + 1] == 's' || fmt[i + 1] == 'S') { - /* just kernel pointers */ if (tmp_buf) cur_arg = raw_args[num_spec]; i++; diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 941d0d2427e3..8e61dc555415 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -21,7 +21,7 @@ static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, { unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; size_t vm_size = vma->vm_end - vma->vm_start; - phys_addr_t addr = virt_to_phys(__start_BTF); + phys_addr_t addr = __pa_symbol(__start_BTF); unsigned long pfn = addr >> PAGE_SHIFT; if (attr->private != __start_BTF || !PAGE_ALIGNED(addr)) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..169845710c7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7027,8 +7027,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -7066,7 +7065,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -7076,6 +7074,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 039d1eb2f215..dd9417425d92 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer) bool cgroup_freezing(struct task_struct *task) { bool ret; - unsigned int state; rcu_read_lock(); - /* Check if the cgroup is still FREEZING, but not FROZEN. The extra - * !FROZEN check is required, because the FREEZING bit is not cleared - * when the state FROZEN is reached. - */ - state = task_freezer(task)->state; - ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; @@ -188,13 +182,12 @@ static void freezer_attach(struct cgroup_taskset *tset) if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } + freeze_task(task); } } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ce4752ab9e09..cbeaa499a96a 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -47,8 +47,20 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss) static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) { - if (ss) + if (ss) { + /* + * Depending on config, the subsystem per-cpu lock type may be an + * empty struct. In enviromnents where this is the case, allocation + * of this field is not performed in ss_rstat_init(). Avoid a + * cpu-based offset relative to NULL by returning early. When the + * lock type is zero in size, the corresponding lock functions are + * no-ops so passing them NULL is acceptable. + */ + if (sizeof(*ss->rstat_ss_cpu_lock) == 0) + return NULL; + return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); + } return per_cpu_ptr(&rstat_base_cpu_lock, cpu); } @@ -510,20 +522,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; -#ifdef CONFIG_SMP /* - * On uniprocessor machines, arch_spinlock_t is defined as an empty - * struct. Avoid allocating a size of zero by having this block - * excluded in this case. It's acceptable to leave the subsystem locks - * unitialized since the associated lock functions are no-ops in the - * non-smp case. + * Depending on config, the subsystem per-cpu lock type may be an empty + * struct. Avoid allocating a size of zero in this case. */ - if (ss) { + if (ss && sizeof(*ss->rstat_ss_cpu_lock)) { ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); if (!ss->rstat_ss_cpu_lock) return -ENOMEM; } -#endif spin_lock_init(ss_rstat_lock(ss)); for_each_possible_cpu(cpu) diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index b753695c5a8f..5dd0f0a34a73 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -2,3 +2,4 @@ CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KERNEL_XZ=y CONFIG_SLUB=y CONFIG_SLUB_TINY=y +CONFIG_LD_DEAD_CODE_DATA_ELIMINATION=y diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8df0dfaaca18..67af8a55185d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -222,7 +222,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit) if (size_cmdline != -1) { selected_size = size_cmdline; selected_base = base_cmdline; - selected_limit = min_not_zero(limit_cmdline, limit); + + /* Hornor the user setup dma address limit */ + selected_limit = limit_cmdline ?: limit; + if (base_cmdline + size_cmdline == limit_cmdline) fixed = true; } else { diff --git a/kernel/events/core.c b/kernel/events/core.c index f34c99f8ce8f..22fdf0c187cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -938,13 +951,19 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == NULL) return; - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; + + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); @@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task) ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -2120,18 +2138,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) del_event_from_groups(event, ctx); - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) { - perf_cgroup_event_disable(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - } - ctx->generation++; event->pmu_ctx->nr_events--; } @@ -2149,8 +2155,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) } static void put_event(struct perf_event *event); -static void event_sched_out(struct perf_event *event, - struct perf_event_context *ctx); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { @@ -2183,8 +2190,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } @@ -2242,18 +2248,6 @@ static inline struct list_head *get_event_list(struct perf_event *event) &event->pmu_ctx->flexible_active; } -/* - * Events that have PERF_EV_CAP_SIBLING require being part of a group and - * cannot exist on their own, schedule them out and move them into the ERROR - * state. Also see _perf_event_enable(), it will not be able to recover - * this ERROR state. - */ -static inline void perf_remove_sibling_event(struct perf_event *event) -{ - event_sched_out(event, event->ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); -} - static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; @@ -2289,8 +2283,15 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state. + */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) - perf_remove_sibling_event(sibling); + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2493,11 +2494,14 @@ __perf_remove_from_context(struct perf_event *event, state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE) state = PERF_EVENT_STATE_REVOKED; - if (flags & DETACH_DEAD) { - event->pending_disable = 1; + if (flags & DETACH_DEAD) state = PERF_EVENT_STATE_DEAD; - } + event_sched_out(event, ctx); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) @@ -2562,6 +2566,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); } +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); +} + /* * Cross CPU call to disable a performance event */ @@ -2576,13 +2589,18 @@ static void __perf_event_disable(struct perf_event *event, perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. + */ if (event == event->group_leader) group_sched_out(event, ctx); - else - event_sched_out(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - perf_cgroup_event_disable(event, ctx); + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. + */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } @@ -2656,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start) static void perf_event_throttle(struct perf_event *event) { - event->pmu->stop(event, 0); event->hw.interrupts = MAX_INTERRUPTS; + event->pmu->stop(event, 0); if (event == event->group_leader) perf_log_throttle(event, 0); } @@ -7186,18 +7204,18 @@ void perf_event_wakeup(struct perf_event *event) static void perf_sigtrap(struct perf_event *event) { /* - * We'd expect this to only occur if the irq_work is delayed and either - * ctx->task or current has changed in the meantime. This can be the - * case on architectures that do not implement arch_irq_work_raise(). + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ - if (WARN_ON_ONCE(event->ctx->task != current)) + if (current->flags & PF_EXITING) return; /* - * Both perf_pending_task() and perf_pending_irq() can race with the - * task exiting. + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). */ - if (current->flags & PF_EXITING) + if (WARN_ON_ONCE(event->ctx->task != current)) return; send_sig_perf((void __user *)event->pending_addr, @@ -7233,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event) * CPU-A CPU-B * * perf_event_disable_inatomic() - * @pending_disable = CPU-A; + * @pending_disable = 1; * irq_work_queue(); * * sched-out - * @pending_disable = -1; + * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() - * @pending_disable = CPU-B; + * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() @@ -7439,6 +7457,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -8150,6 +8172,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + if (!current->mm) + user = false; + if (!kernel && !user) return &__empty_callchain; @@ -11091,7 +11116,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!perfmon_capable()) + if (!capable(CAP_SYS_ADMIN)) return -EACCES; /* @@ -11749,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (is_sampling_event(event)) { + /* + * The throttle can be triggered in the hrtimer handler. + * The HRTIMER_NORESTART should be used to stop the timer, + * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + */ + if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); @@ -11804,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags) static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); + if (flags & PERF_EF_UPDATE) + cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) @@ -11882,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags) static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); + if (flags & PERF_EF_UPDATE) + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index d2aef87c7e9f..aa9a759e824f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; @@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); } diff --git a/kernel/exit.c b/kernel/exit.c index bd743900354c..bb184a67ac73 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -940,6 +940,15 @@ void __noreturn do_exit(long code) taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); + exit_mm(); if (group_dead) @@ -955,14 +964,6 @@ void __noreturn do_exit(long code) exit_task_work(tsk); exit_thread(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - sched_autogroup_exit_task(tsk); cgroup_exit(tsk); diff --git a/kernel/freezer.c b/kernel/freezer.c index 8d530d0949ff..6a96149aede9 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -201,18 +201,9 @@ static int __restore_freezer_state(struct task_struct *p, void *arg) void __thaw_task(struct task_struct *p) { - unsigned long flags; - - spin_lock_irqsave(&freezer_lock, flags); - if (WARN_ON_ONCE(freezing(p))) - goto unlock; - - if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) - goto unlock; - - wake_up_state(p, TASK_FROZEN); -unlock: - spin_unlock_irqrestore(&freezer_lock, flags); + guard(spinlock_irqsave)(&freezer_lock); + if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL)) + wake_up_state(p, TASK_FROZEN); } /** diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 565f9717c6ca..90d53fb0ee9e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -583,8 +583,8 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, if (futex_get_value(&node, naddr)) return -EFAULT; - if (node != FUTEX_NO_NODE && - (node >= MAX_NUMNODES || !node_possible(node))) + if ((node != FUTEX_NO_NODE) && + ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; } @@ -1629,6 +1629,16 @@ again: mm->futex_phash_new = NULL; if (fph) { + if (cur && (!cur->hash_mask || cur->immutable)) { + /* + * If two threads simultaneously request the global + * hash then the first one performs the switch, + * the second one returns here. + */ + free = fph; + mm->futex_phash_new = new; + return -EBUSY; + } if (cur && !new) { /* * If we have an existing hash, but do not yet have diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b0e0a7332993..2b274007e8ba 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -205,6 +205,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, void irq_startup_managed(struct irq_desc *desc) { + struct irq_data *d = irq_desc_get_irq_data(desc); + + /* + * Clear managed-shutdown flag, so we don't repeat managed-startup for + * multiple hotplugs, and cause imbalanced disable depth. + */ + irqd_clr_managed_shutdown(d); + /* * Only start it up when the disable depth is 1, so that a disable, * hotunplug, hotplug sequence does not end up enabling it during diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index f07529ae4895..755346ea9819 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -210,13 +210,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - /* - * Don't restore suspended interrupts here when a system comes back - * from S3. They are reenabled via resume_device_irqs(). - */ - if (desc->istate & IRQS_SUSPENDED) - return; - if (irqd_is_managed_and_shutdown(data)) irq_startup_managed(desc); diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 1a3d483548e2..ae4c9cbd1b4b 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, void *data) { struct irq_sim_work_ctx *work_ctx __free(kfree) = - kmalloc(sizeof(*work_ctx), GFP_KERNEL); + kzalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) return ERR_PTR(-ENOMEM); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9c59fa480b0b..3a9a9f240dbc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1136,6 +1136,7 @@ int kernel_kexec(void) Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: + pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); Restore_console: diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 69b953551677..5a21dbe17950 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -164,11 +164,21 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } /* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page) +static void kho_restore_page(struct page *page, unsigned int order) { - ClearPageReserved(page); - init_page_count(page); - adjust_managed_page_count(page, 1); + unsigned int nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); + + adjust_managed_page_count(page, nr_pages); } /** @@ -186,15 +196,10 @@ struct folio *kho_restore_folio(phys_addr_t phys) return NULL; order = page->private; - if (order) { - if (order > MAX_PAGE_ORDER) - return NULL; - - prep_compound_page(page, order); - } else { - kho_restore_page(page); - } + if (order > MAX_PAGE_ORDER) + return NULL; + kho_restore_page(page, order); return page_folio(page); } EXPORT_SYMBOL_GPL(kho_restore_folio); diff --git a/kernel/kthread.c b/kernel/kthread.c index 77c44924cf54..85fc068f0083 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1207,7 +1207,8 @@ EXPORT_SYMBOL_GPL(kthread_queue_work); */ void kthread_delayed_work_timer_fn(struct timer_list *t) { - struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); + struct kthread_delayed_work *dwork = timer_container_of(dwork, t, + timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; diff --git a/kernel/module/main.c b/kernel/module/main.c index 3d64e69cc03e..c2c08007029d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -170,6 +170,30 @@ static inline void add_taint_module(struct module *mod, unsigned flag, } /* + * Like strncmp(), except s/-/_/g as per scripts/Makefile.lib:name-fix-token rule. + */ +static int mod_strncmp(const char *str_a, const char *str_b, size_t n) +{ + for (int i = 0; i < n; i++) { + char a = str_a[i]; + char b = str_b[i]; + int d; + + if (a == '-') a = '_'; + if (b == '-') b = '_'; + + d = a - b; + if (d) + return d; + + if (!a) + break; + } + + return 0; +} + +/* * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. */ @@ -1083,6 +1107,46 @@ static char *get_modinfo(const struct load_info *info, const char *tag) return get_next_modinfo(info, tag, NULL); } +/** + * verify_module_namespace() - does @modname have access to this symbol's @namespace + * @namespace: export symbol namespace + * @modname: module name + * + * If @namespace is prefixed with "module:" to indicate it is a module namespace + * then test if @modname matches any of the comma separated patterns. + * + * The patterns only support tail-glob. + */ +static bool verify_module_namespace(const char *namespace, const char *modname) +{ + size_t len, modlen = strlen(modname); + const char *prefix = "module:"; + const char *sep; + bool glob; + + if (!strstarts(namespace, prefix)) + return false; + + for (namespace += strlen(prefix); *namespace; namespace = sep) { + sep = strchrnul(namespace, ','); + len = sep - namespace; + + glob = false; + if (sep[-1] == '*') { + len--; + glob = true; + } + + if (*sep) + sep++; + + if (mod_strncmp(namespace, modname, len) == 0 && (glob || len == modlen)) + return true; + } + + return false; +} + static int verify_namespace_is_imported(const struct load_info *info, const struct kernel_symbol *sym, struct module *mod) @@ -1092,6 +1156,10 @@ static int verify_namespace_is_imported(const struct load_info *info, namespace = kernel_symbol_namespace(sym); if (namespace && namespace[0]) { + + if (verify_module_namespace(namespace, mod->name)) + return 0; + for_each_modinfo_entry(imported_namespace, info, "import_ns") { if (strcmp(namespace, imported_namespace) == 0) return 0; @@ -1505,8 +1573,14 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (infosec >= info->hdr->e_shnum) continue; - /* Don't bother with non-allocated sections */ - if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + /* + * Don't bother with non-allocated sections. + * An exception is the percpu section, which has separate allocations + * for individual CPUs. We relocate the percpu section in the initial + * ELF template and subsequently copy it to the per-CPU destinations. + */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) && + (!infosec || infosec != info->index.pcpu)) continue; if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) @@ -1658,15 +1732,30 @@ static void module_license_taint_check(struct module *mod, const char *license) } } -static void setup_modinfo(struct module *mod, struct load_info *info) +static int setup_modinfo(struct module *mod, struct load_info *info) { const struct module_attribute *attr; + char *imported_namespace; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) attr->setup(mod, get_modinfo(info, attr->attr.name)); } + + for_each_modinfo_entry(imported_namespace, info, "import_ns") { + /* + * 'module:' prefixed namespaces are implicit, disallow + * explicit imports. + */ + if (strstarts(imported_namespace, "module:")) { + pr_err("%s: module tries to import module namespace: %s\n", + mod->name, imported_namespace); + return -EPERM; + } + } + + return 0; } static void free_modinfo(struct module *mod) @@ -2613,9 +2702,8 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { - int i; - enum mod_mem_type t = 0; - int ret = -ENOMEM; + int i, ret; + enum mod_mem_type t = MOD_MEM_NUM_TYPES; bool codetag_section_found = false; for_each_mod_mem_type(type) { @@ -2693,7 +2781,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_err: module_memory_restore_rox(mod); - for (t--; t >= 0; t--) + while (t--) module_memory_free(mod, t); if (codetag_section_found) codetag_free_module_sections(mod); @@ -3323,7 +3411,9 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_unload; /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, info); + err = setup_modinfo(mod, info); + if (err) + goto free_modinfo; /* Fix up syms, so that st_value is a pointer to location. */ err = simplify_symbols(mod, info); @@ -3386,11 +3476,12 @@ static int load_module(struct load_info *info, const char __user *uargs, goto sysfs_cleanup; } + if (codetag_load_module(mod)) + goto sysfs_cleanup; + /* Get rid of temporary copy. */ free_copy(info, flags); - codetag_load_module(mod); - /* Done! */ trace_module_load(mod); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 519fb09de5e0..9216e3b91d3b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -423,7 +423,6 @@ int hibernation_snapshot(int platform_mode) } console_suspend_all(); - pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -559,7 +558,6 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); console_suspend_all(); - pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); @@ -571,7 +569,6 @@ int hibernation_restore(int platform_mode) BUG_ON(!error); } dpm_resume_end(PMSG_RECOVER); - pm_restore_gfp_mask(); console_resume_all(); pm_restore_console(); return error; diff --git a/kernel/power/power.h b/kernel/power/power.h index cb1d71562002..7ccd709af93f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -239,11 +239,6 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); -void pm_restrict_gfp_mask(void); -void pm_restore_gfp_mask(void); -#else -static inline void pm_restrict_gfp_mask(void) {} -static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 76b141b9aac0..b4ca17c2fecf 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -384,6 +384,7 @@ static int suspend_prepare(suspend_state_t state) return 0; dpm_save_failed_step(SUSPEND_FREEZE); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); Restore: pm_restore_console(); @@ -592,8 +593,6 @@ static int enter_state(suspend_state_t state) ksys_sync_helper(); trace_suspend_resume(TPS("sync_filesystems"), 0, false); } - if (filesystem_freeze_enabled) - filesystems_freeze(); pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); @@ -606,16 +605,13 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("suspend_enter"), state, false); pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); - pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); - pm_restore_gfp_mask(); Finish: events_check_enabled = false; pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: - filesystems_thaw(); mutex_unlock(&system_transition_mutex); return error; } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c0cc7ae41106..f92443561d36 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -316,7 +316,8 @@ static void call_rcu_tasks_generic_timer(struct timer_list *tlp) unsigned long flags; bool needwake = false; struct rcu_tasks *rtp; - struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer); + struct rcu_tasks_percpu *rtpcp = timer_container_of(rtpcp, tlp, + lazy_timer); rtp = rtpcp->rtpp; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e8a4b720d7d2..14d4499c6fc3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3072,6 +3072,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 1596812f7f12..b473ff056f49 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -985,7 +985,7 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, static void do_nocb_deferred_wakeup_timer(struct timer_list *t) { unsigned long flags; - struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); + struct rcu_data *rdp = timer_container_of(rdp, t, nocb_timer); WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 56b21219442b..486c00536207 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -20,6 +20,28 @@ int sysctl_panic_on_rcu_stall __read_mostly; int sysctl_max_rcu_stall_to_panic __read_mostly; +#ifdef CONFIG_SYSFS + +static unsigned int rcu_stall_count; + +static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", rcu_stall_count); +} + +static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count); + +static __init int kernel_rcu_stall_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_rcu_stall_sysfs_init); + +#endif // CONFIG_SYSFS + #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) #else @@ -784,6 +806,10 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; +#ifdef CONFIG_SYSFS + ++rcu_stall_count; +#endif + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dce50fa57471..81c6df746df1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { - __schedstat_inc(p->stats.numa_task_swapped); - count_vm_numa_event(NUMA_TASK_SWAP); - count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); - if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -3943,6 +3939,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) if (!scx_allow_ttwu_queue(p)) return false; +#ifdef CONFIG_SMP + if (p->sched_class == &stop_sched_class) + return false; +#endif + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. @@ -7663,7 +7664,7 @@ const char *preempt_model_str(void) if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) { seq_buf_printf(&s, "(%s)%s", - preempt_dynamic_mode > 0 ? + preempt_dynamic_mode >= 0 ? preempt_modes[preempt_dynamic_mode] : "undef", brace ? "}" : ""); return seq_buf_str(&s); @@ -7934,9 +7935,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - __schedstat_inc(p->stats.numa_task_migrated); - count_vm_numa_event(NUMA_TASK_MIGRATE); - count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); + /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } @@ -8545,7 +8545,7 @@ void __init sched_init(void) init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - root_task_group.scx_weight = CGROUP_WEIGHT_DFL; + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -8985,7 +8985,7 @@ struct task_group *sched_create_group(struct task_group *parent) if (!alloc_rt_sched_group(tg, parent)) goto err; - scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); + scx_tg_init(tg); alloc_uclamp_sched_group(tg, parent); return tg; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245..89019a140826 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1504,7 +1504,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 if (dl_entity_is_special(dl_se)) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + scaled_delta_exec = delta_exec; + if (!dl_server(dl_se)) + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); dl_se->runtime -= scaled_delta_exec; @@ -1611,7 +1613,7 @@ throttle: */ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) { - s64 delta_exec, scaled_delta_exec; + s64 delta_exec; if (!rq->fair_server.dl_defer) return; @@ -1624,9 +1626,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) if (delta_exec < 0) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); - - rq->fair_server.runtime -= scaled_delta_exec; + rq->fair_server.runtime -= delta_exec; if (rq->fair_server.runtime < 0) { rq->fair_server.dl_defer_running = 0; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9d71baf08075..557246880a7e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); -#ifdef CONFIG_NUMA_BALANCING - P_SCHEDSTAT(numa_task_migrated); - P_SCHEDSTAT(numa_task_swapped); -#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2c41c78be61e..7dd5cbcb7a06 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1272,7 +1272,8 @@ static inline struct rq *scx_locked_rq(void) #define SCX_CALL_OP(sch, mask, op, rq, args...) \ do { \ - update_locked_rq(rq); \ + if (rq) \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ (sch)->ops.op(args); \ @@ -1280,14 +1281,16 @@ do { \ } else { \ (sch)->ops.op(args); \ } \ - update_locked_rq(NULL); \ + if (rq) \ + update_locked_rq(NULL); \ } while (0) #define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ ({ \ __typeof__((sch)->ops.op(args)) __ret; \ \ - update_locked_rq(rq); \ + if (rq) \ + update_locked_rq(rq); \ if (mask) { \ scx_kf_allow(mask); \ __ret = (sch)->ops.op(args); \ @@ -1295,7 +1298,8 @@ do { \ } else { \ __ret = (sch)->ops.op(args); \ } \ - update_locked_rq(NULL); \ + if (rq) \ + update_locked_rq(NULL); \ __ret; \ }) @@ -4092,6 +4096,11 @@ bool scx_can_stop_tick(struct rq *rq) DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); static bool scx_cgroup_enabled; +void scx_tg_init(struct task_group *tg) +{ + tg->scx_weight = CGROUP_WEIGHT_DFL; +} + int scx_tg_online(struct task_group *tg) { struct scx_sched *sch = scx_root; @@ -4241,12 +4250,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && tg->scx_weight != weight) { - if (SCX_HAS_OP(sch, cgroup_set_weight)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, - tg_cgrp(tg), weight); - tg->scx_weight = weight; - } + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && + tg->scx_weight != weight) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, + tg_cgrp(tg), weight); + + tg->scx_weight = weight; percpu_up_read(&scx_cgroup_rwsem); } diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 6e5072f57771..a75835c23f15 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -79,6 +79,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {} #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_EXT_GROUP_SCHED +void scx_tg_init(struct task_group *tg); int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); @@ -88,6 +89,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); #else /* CONFIG_EXT_GROUP_SCHED */ +static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 66da03cc0b33..001fb88a8481 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -138,6 +138,7 @@ found: goto retry; } +#ifdef CONFIG_NUMA /* * Tracks nodes that have not yet been visited when searching for an idle * CPU across all available nodes. @@ -186,6 +187,13 @@ static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, i return cpu; } +#else +static inline s32 +pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} +#endif /* * Find an idle CPU in the system, starting from @node. @@ -447,11 +455,18 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr; int node = scx_cpu_node_if_enabled(prev_cpu); + bool is_prev_allowed; s32 cpu; preempt_disable(); /* + * Check whether @prev_cpu is still within the allowed set. If not, + * we can still try selecting a nearby CPU. + */ + is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed); + + /* * Determine the subset of CPUs usable by @p within @cpus_allowed. */ if (allowed != p->cpus_ptr) { @@ -465,21 +480,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, cpu = -EBUSY; goto out_enable; } - - /* - * If @prev_cpu is not in the allowed CPUs, skip topology - * optimizations and try to pick any idle CPU usable by the - * task. - * - * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize - * the current node, as it may optimize some waker->wakee - * workloads. - */ - if (!cpumask_test_cpu(prev_cpu, allowed)) { - node = scx_cpu_node_if_enabled(smp_processor_id()); - cpu = scx_pick_idle_cpu(allowed, node, flags); - goto out_enable; - } } /* @@ -525,7 +525,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * then avoid a migration. */ cpu = smp_processor_id(); - if (cpus_share_cache(cpu, prev_cpu) && + if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; @@ -562,7 +562,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, /* * Keep using @prev_cpu if it's part of a fully idle core. */ - if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && + if (is_prev_allowed && + cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; @@ -611,7 +612,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, /* * Use @prev_cpu if it's idle. */ - if (scx_idle_test_and_clear_cpu(prev_cpu)) { + if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) { cpu = prev_cpu; goto out_unlock; } @@ -902,7 +903,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. */ - if (p->nr_cpus_allowed == 1) { + if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2..52ca8e268cfc 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -80,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (int)this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1396674fa722..ad04a5c3162a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -733,7 +733,7 @@ static int psi_rtpoll_worker(void *data) static void poll_timer_fn(struct timer_list *t) { - struct psi_group *group = from_timer(group, t, rtpoll_timer); + struct psi_group *group = timer_container_of(group, t, rtpoll_timer); atomic_set(&group->rtpoll_wakeup, 1); wake_up_interruptible(&group->rtpoll_wait); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 475bb5998295..83e3aa917142 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1149,7 +1149,7 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned int nr_uninterruptible; + unsigned long nr_uninterruptible; union { struct task_struct __rcu *donor; /* Scheduler context */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 5d2d0562115b..3fe6b0c99f3d 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done) } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, - struct cpu_stop_work *work, - struct wake_q_head *wakeq) + struct cpu_stop_work *work) { list_add_tail(&work->list, &stopper->works); - wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); - DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; @@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) - __cpu_stop_queue_work(stopper, work, &wakeq); + __cpu_stop_queue_work(stopper, work); else if (work->done) cpu_stop_signal_done(work->done); raw_spin_unlock_irqrestore(&stopper->lock, flags); - wake_up_q(&wakeq); + if (enabled) + wake_up_process(stopper->thread); preempt_enable(); return enabled; @@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); - DEFINE_WAKE_Q(wakeq); int err; retry: @@ -300,8 +297,8 @@ retry: } err = 0; - __cpu_stop_queue_work(stopper1, work1, &wakeq); - __cpu_stop_queue_work(stopper2, work2, &wakeq); + __cpu_stop_queue_work(stopper1, work1); + __cpu_stop_queue_work(stopper2, work2); unlock: raw_spin_unlock(&stopper2->lock); @@ -316,7 +313,10 @@ unlock: goto retry; } - wake_up_q(&wakeq); + if (!err) { + wake_up_process(stopper1->thread); + wake_up_process(stopper2->thread); + } preempt_enable(); return err; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 50e8d04ab661..2e5b89d7d866 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. */ diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index 5aa38b2cf40a..3c90574bd904 100644 --- a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -22,7 +22,7 @@ struct process_timer { static void process_timeout(struct timer_list *t) { - struct process_timer *timeout = from_timer(timeout, t, timer); + struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1af952cba48d..4203fad56b6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -188,7 +188,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, op->saved_func(ip, parent_ip, op, fregs); } -static void ftrace_sync_ipi(void *data) +void ftrace_sync_ipi(void *data) { /* Probably not needed, but do it anyway */ smp_rmb(); @@ -7438,9 +7438,10 @@ void ftrace_release_mod(struct module *mod) mutex_lock(&ftrace_lock); - if (ftrace_disabled) - goto out_unlock; - + /* + * To avoid the UAF problem after the module is unloaded, the + * 'mod_map' resource needs to be released unconditionally. + */ list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { if (mod_map->mod == mod) { list_del_rcu(&mod_map->list); @@ -7449,6 +7450,9 @@ void ftrace_release_mod(struct module *mod) } } + if (ftrace_disabled) + goto out_unlock; + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -7627,6 +7631,9 @@ allocate_ftrace_mod_map(struct module *mod, { struct ftrace_mod_map *mod_map; + if (ftrace_disabled) + return NULL; + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); if (!mod_map) return NULL; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e24509bd0af5..00fc38d70e86 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -6795,7 +6795,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6900,7 +6900,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6909,7 +6908,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 120531268abf..d01e5c910ce1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3136,7 +3136,10 @@ __register_event(struct trace_event_call *call, struct module *mod) if (ret < 0) return ret; + down_write(&trace_event_sem); list_add(&call->list, &ftrace_events); + up_write(&trace_event_sem); + if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else @@ -3750,6 +3753,8 @@ __trace_add_event_dirs(struct trace_array *tr) struct trace_event_call *call; int ret; + lockdep_assert_held(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2048560264bb..3885aadc434d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1250,7 +1250,9 @@ static void append_filter_err(struct trace_array *tr, static inline struct event_filter *event_filter(struct trace_event_file *file) { - return file->filter; + return rcu_dereference_protected(file->filter, + lockdep_is_held(&event_mutex)); + } /* caller must hold event_mutex */ @@ -1320,7 +1322,7 @@ void free_event_filter(struct event_filter *filter) static inline void __remove_filter(struct trace_event_file *file) { filter_disable(file); - remove_filter_string(file->filter); + remove_filter_string(event_filter(file)); } static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, @@ -1335,22 +1337,137 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir, } } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +struct filter_head { + struct list_head list; + struct rcu_head rcu; +}; + + +static void free_filter_list(struct rcu_head *rhp) +{ + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + struct filter_list *filter_item, *tmp; + + list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + kfree(filter_list); +} + +static void free_filter_list_tasks(struct rcu_head *rhp) +{ + call_rcu(rhp, free_filter_list); +} + +/* + * The tracepoint_synchronize_unregister() is a double rcu call. + * It calls synchronize_rcu_tasks_trace() followed by synchronize_rcu(). + * Instead of waiting for it, simply call these via the call_rcu*() + * variants. + */ +static void delay_free_filter(struct filter_head *head) +{ + call_rcu_tasks_trace(&head->rcu, free_filter_list_tasks); +} + +static void try_delay_free_filter(struct event_filter *filter) +{ + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + kfree(head); + goto free_now; + } + + item->filter = filter; + list_add_tail(&item->list, &head->list); + delay_free_filter(head); + return; + + free_now: + /* Make sure the filter is not being used */ + tracepoint_synchronize_unregister(); + __free_filter(filter); +} + static inline void __free_subsystem_filter(struct trace_event_file *file) { - __free_filter(file->filter); + __free_filter(event_filter(file)); file->filter = NULL; } +static inline void event_set_filter(struct trace_event_file *file, + struct event_filter *filter) +{ + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct trace_event_file *file) +{ + RCU_INIT_POINTER(file->filter, NULL); +} + static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, - struct trace_array *tr) + struct trace_array *tr, + struct event_filter *filter) { struct trace_event_file *file; + struct filter_head *head; + struct filter_list *item; + + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto free_now; + + INIT_LIST_HEAD(&head->list); list_for_each_entry(file, &tr->events, list) { if (file->system != dir) continue; + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + item->filter = event_filter(file); + list_add_tail(&item->list, &head->list); + event_clear_filter(file); + } + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) + goto free_now; + + item->filter = filter; + list_add_tail(&item->list, &head->list); + + delay_free_filter(head); + return; + free_now: + tracepoint_synchronize_unregister(); + + if (head) + free_filter_list(&head->rcu); + + list_for_each_entry(file, &tr->events, list) { + if (file->system != dir || !file->filter) + continue; __free_subsystem_filter(file); } + __free_filter(filter); } int filter_assign_type(const char *type) @@ -2120,22 +2237,6 @@ static inline void event_set_filtered_flag(struct trace_event_file *file) trace_buffered_event_enable(); } -static inline void event_set_filter(struct trace_event_file *file, - struct event_filter *filter) -{ - rcu_assign_pointer(file->filter, filter); -} - -static inline void event_clear_filter(struct trace_event_file *file) -{ - RCU_INIT_POINTER(file->filter, NULL); -} - -struct filter_list { - struct list_head list; - struct event_filter *filter; -}; - static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_array *tr, struct filter_parse_error *pe, @@ -2144,11 +2245,16 @@ static int process_system_preds(struct trace_subsystem_dir *dir, struct trace_event_file *file; struct filter_list *filter_item; struct event_filter *filter = NULL; - struct filter_list *tmp; - LIST_HEAD(filter_list); + struct filter_head *filter_list; bool fail = true; int err; + filter_list = kmalloc(sizeof(*filter_list), GFP_KERNEL); + if (!filter_list) + return -ENOMEM; + + INIT_LIST_HEAD(&filter_list->list); + list_for_each_entry(file, &tr->events, list) { if (file->system != dir) @@ -2175,7 +2281,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!filter_item) goto fail_mem; - list_add_tail(&filter_item->list, &filter_list); + list_add_tail(&filter_item->list, &filter_list->list); /* * Regardless of if this returned an error, we still * replace the filter for the call. @@ -2195,31 +2301,22 @@ static int process_system_preds(struct trace_subsystem_dir *dir, * Do a synchronize_rcu() and to ensure all calls are * done with them before we free them. */ - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); return 0; fail: /* No call succeeded */ - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - list_del(&filter_item->list); - kfree(filter_item); - } + free_filter_list(&filter_list->rcu); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: __free_filter(filter); + /* If any call succeeded, we still need to sync */ if (!fail) - tracepoint_synchronize_unregister(); - list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { - __free_filter(filter_item->filter); - list_del(&filter_item->list); - kfree(filter_item); - } + delay_free_filter(filter_list); + else + free_filter_list(&filter_list->rcu); + return -ENOMEM; } @@ -2361,9 +2458,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_clear_filter(file); - /* Make sure the filter is not being used */ - tracepoint_synchronize_unregister(); - __free_filter(filter); + try_delay_free_filter(filter); return 0; } @@ -2387,11 +2482,8 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) event_set_filter(file, filter); - if (tmp) { - /* Make sure the call is done with the filter */ - tracepoint_synchronize_unregister(); - __free_filter(tmp); - } + if (tmp) + try_delay_free_filter(tmp); } return err; @@ -2417,9 +2509,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ - tracepoint_synchronize_unregister(); - filter_free_subsystem_filters(dir, tr); - __free_filter(filter); + filter_free_subsystem_filters(dir, tr, filter); return 0; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 9234e2c39abf..14d74a7491b8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -455,10 +455,16 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static struct tracer graph_trace; + static int ftrace_graph_trace_args(struct trace_array *tr, int set) { trace_func_graph_ent_t entry; + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; + if (set) entry = trace_graph_entry_args; else diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 6819b93309ce..fd259da0aa64 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -637,8 +637,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u entry = ring_buffer_event_data(event); - memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; + memcpy(&entry->caller, fstack->calls, size); trace_buffer_unlock_commit_nostack(buffer, event); } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 424751cdf31f..40830a3ecd96 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -657,7 +657,7 @@ static int parse_btf_arg(char *varname, ret = query_btf_context(ctx); if (ret < 0 || ctx->nr_params == 0) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); - return PTR_ERR(params); + return -ENOENT; } } params = ctx->params; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3bef0754cf73..9f9148075828 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2481,7 +2481,7 @@ EXPORT_SYMBOL_GPL(queue_work_node); void delayed_work_timer_fn(struct timer_list *t) { - struct delayed_work *dwork = from_timer(dwork, t, timer); + struct delayed_work *dwork = timer_container_of(dwork, t, timer); /* should have been called from irqsafe timer with irq already off */ __queue_work(dwork->cpu, dwork->wq, &dwork->work); @@ -2909,7 +2909,7 @@ static void set_worker_dying(struct worker *worker, struct list_head *list) */ static void idle_worker_timeout(struct timer_list *t) { - struct worker_pool *pool = from_timer(pool, t, idle_timer); + struct worker_pool *pool = timer_container_of(pool, t, idle_timer); bool do_cull = false; if (work_pending(&pool->idle_cull_work)) @@ -3008,7 +3008,7 @@ static void send_mayday(struct work_struct *work) static void pool_mayday_timeout(struct timer_list *t) { - struct worker_pool *pool = from_timer(pool, t, mayday_timer); + struct worker_pool *pool = timer_container_of(pool, t, mayday_timer); struct work_struct *work; raw_spin_lock_irq(&pool->lock); @@ -7767,7 +7767,8 @@ void __init workqueue_init_early(void) restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); - + cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask, + housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs(); |