diff options
Diffstat (limited to 'kernel')
78 files changed, 1990 insertions, 787 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..f0902a7bd1b3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -115,6 +115,8 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o +obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o + obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1f31c2f1e6fc..4508d5e0cf69 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -351,12 +351,12 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d->d_sb->s_dev; watch->ino = d_backing_inode(d)->i_ino; } + inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ddd8addcdb5c..a3eaf08e7dd3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1311,12 +1311,12 @@ static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, return false; switch (off) { - case offsetof(struct bpf_sysctl, write): + case bpf_ctx_range(struct bpf_sysctl, write): if (type != BPF_READ) return false; bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); - case offsetof(struct bpf_sysctl, file_pos): + case bpf_ctx_range(struct bpf_sysctl, file_pos): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 66088a9e9b9e..ef0e1e3e66f4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -502,7 +502,7 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) return 
WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); } -void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d27f3b60ff6d..3867864cdc2f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -128,7 +128,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += sizeof(struct hlist_head) * dtab->n_buckets; + cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; } /* if map size is larger than memlock limit, reject it */ @@ -719,6 +719,32 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, }; +static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, + struct net_device *netdev) +{ + unsigned long flags; + u32 i; + + spin_lock_irqsave(&dtab->index_lock, flags); + for (i = 0; i < dtab->n_buckets; i++) { + struct bpf_dtab_netdev *dev; + struct hlist_head *head; + struct hlist_node *next; + + head = dev_map_index_hash(dtab, i); + + hlist_for_each_entry_safe(dev, next, head, index_hlist) { + if (netdev != dev->dev) + continue; + + dtab->items--; + hlist_del_rcu(&dev->index_hlist); + call_rcu(&dev->rcu, __dev_map_entry_free); + } + } + spin_unlock_irqrestore(&dtab->index_lock, flags); +} + static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { @@ -735,6 +761,11 @@ static int dev_map_notification(struct notifier_block *notifier, */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { + if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + dev_map_hash_remove_netdev(dtab, netdev); + continue; + } + for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5e28718928ca..cada974c9f4e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -317,7 
+317,7 @@ BPF_CALL_0(bpf_get_current_cgroup_id) { struct cgroup *cgrp = task_dfl_cgroup(current); - return cgrp->kn->id.id; + return cgroup_id(cgrp); } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index addd6fdceec8..2ba750725cb2 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -569,7 +569,7 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, return; storage->key.attach_type = type; - storage->key.cgroup_inode_id = cgroup->kn->id.id; + storage->key.cgroup_inode_id = cgroup_id(cgroup); map = storage->map; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index ba635209ae9a..5b9da0954a27 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -678,8 +678,10 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) down_write(&bpf_devs_lock); if (!offdevs_inited) { err = rhashtable_init(&offdevs, &offdevs_params); - if (err) + if (err) { + up_write(&bpf_devs_lock); return ERR_PTR(err); + } offdevs_inited = true; } up_write(&bpf_devs_lock); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 82eabd4e38ad..ace1cfaa24b6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -126,7 +126,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size, int numa_node) +void *bpf_map_area_alloc(u64 size, int numa_node) { /* We really just want to fail instead of triggering OOM killer * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, @@ -141,6 +141,9 @@ void *bpf_map_area_alloc(size_t size, int numa_node) const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; + if (size >= SIZE_MAX) + return NULL; + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, numa_node); @@ -197,7 +200,7 @@ static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) 
atomic_long_sub(pages, &user->locked_vm); } -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) { u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; struct user_struct *user; @@ -1326,24 +1329,32 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) { struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + kvfree(aux->func_info); free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } +static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) +{ + bpf_prog_kallsyms_del_all(prog); + btf_put(prog->aux->btf); + bpf_prog_free_linfo(prog); + + if (deferred) + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + else + __bpf_prog_put_rcu(&prog->aux->rcu); +} + static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - bpf_prog_kallsyms_del_all(prog); - btf_put(prog->aux->btf); - kvfree(prog->aux->func_info); - bpf_prog_free_linfo(prog); - - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + __bpf_prog_put_noref(prog, true); } } @@ -1741,11 +1752,12 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) return err; free_used_maps: - bpf_prog_free_linfo(prog); - kvfree(prog->aux->func_info); - btf_put(prog->aux->btf); - bpf_prog_kallsyms_del_subprogs(prog); - free_used_maps(prog->aux); + /* In case we have subprogs, we need to wait for a grace + * period before we can tear down JIT memory since symbols + * are already exposed under kallsyms. 
+ */ + __bpf_prog_put_noref(prog, prog->aux->func_cnt); + return err; free_prog: bpf_prog_uncharge_memlock(prog); free_prog_sec: diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 809e34a3c017..90d1710fef6c 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -231,9 +231,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 7f83f4121d8d..09f3a413f6f8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -495,12 +495,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; + bool locked; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup); + task = cgroup_procs_write_start(buf, threadgroup, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -522,7 +523,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 080561bb8a4b..735af8f15f95 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -899,8 +899,7 @@ static void 
css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * against cgroup_exit()/cgroup_free() dropping the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -1309,10 +1308,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root) void cgroup_free_root(struct cgroup_root *root) { - if (root) { - idr_destroy(&root->cgroup_idr); - kfree(root); - } + kfree(root); } static void cgroup_destroy_root(struct cgroup_root *root) @@ -1374,6 +1370,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) cset = current->nsproxy->cgroup_ns->root_cset; if (cset == &init_css_set) { res = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; @@ -1430,9 +1428,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root) { /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. + * No need to lock the task - since we hold css_set_lock the + * task can't change groups. */ return cset_cgroup_from_root(task_css_set(task), root); } @@ -1883,65 +1880,6 @@ static int cgroup_reconfigure(struct fs_context *fc) return 0; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static bool use_task_css_set_links __read_mostly; - -void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). 
Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - do_each_thread(g, p) { - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - * Do it while holding siglock so that we don't end up - * racing against cgroup_exit(). - * - * Interrupts were already disabled while acquiring - * the css_set_lock, so we do not need to disable it - * again when acquiring the sighand->siglock here. - */ - spin_lock(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) { - struct css_set *cset = task_css_set(p); - - if (!css_set_populated(cset)) - css_set_update_populated(cset, true); - list_add_tail(&p->cg_list, &cset->tasks); - get_css_set(cset); - cset->nr_tasks++; - } - spin_unlock(&p->sighand->siglock); - } while_each_thread(g, p); -out_unlock: - spin_unlock_irq(&css_set_lock); - read_unlock(&tasklist_lock); -} - static void init_cgroup_housekeeping(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -1976,7 +1914,6 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); - idr_init(&root->cgroup_idr); root->flags = ctx->flags; if (ctx->release_agent) @@ -1997,12 +1934,6 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) lockdep_assert_held(&cgroup_mutex); - ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL); - if (ret < 0) - goto out; - root_cgrp->id = ret; - root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 
0, GFP_KERNEL); if (ret) @@ -2035,6 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto exit_root_id; } root_cgrp->kn = root->kf_root->kn; + WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); + root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -2119,11 +2052,12 @@ int cgroup_do_get_tree(struct fs_context *fc) nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(fc->root); - fc->root = nsdentry; if (IS_ERR(nsdentry)) { - ret = PTR_ERR(nsdentry); deactivate_locked_super(sb); + ret = PTR_ERR(nsdentry); + nsdentry = NULL; } + fc->root = nsdentry; } if (!ctx->kfc.new_sb_created) @@ -2187,13 +2121,6 @@ static int cgroup_init_fs_context(struct fs_context *fc) if (!ctx) return -ENOMEM; - /* - * The first time anyone tries to mount a cgroup, enable the list - * linking each css_set to its tasks and fix up all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); - ctx->ns = current->nsproxy->cgroup_ns; get_cgroup_ns(ctx->ns); fc->fs_private = &ctx->kfc; @@ -2371,9 +2298,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (task->flags & PF_EXITING) return; - /* leave @task alone if post_fork() hasn't linked it yet */ - if (list_empty(&task->cg_list)) - return; + /* cgroup_threadgroup_rwsem protects racing against forks */ + WARN_ON_ONCE(list_empty(&task->cg_list)); cset = task_css_set(task); if (!cset->mg_src_cgrp) @@ -2824,7 +2750,8 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, return ret; } -struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) +struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, + bool *locked) __acquires(&cgroup_threadgroup_rwsem) { struct task_struct *tsk; @@ -2833,7 +2760,21 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return ERR_PTR(-EINVAL); - 
percpu_down_write(&cgroup_threadgroup_rwsem); + /* + * If we migrate a single thread, we don't care about threadgroup + * stability. If the thread is `current`, it won't exit(2) under our + * hands or change PID through exec(2). We exclude + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write + * callers by cgroup_mutex. + * Therefore, we can skip the global lock. + */ + lockdep_assert_held(&cgroup_mutex); + if (pid || threadgroup) { + percpu_down_write(&cgroup_threadgroup_rwsem); + *locked = true; + } else { + *locked = false; + } rcu_read_lock(); if (pid) { @@ -2864,13 +2805,16 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup) goto out_unlock_rcu; out_unlock_threadgroup: - percpu_up_write(&cgroup_threadgroup_rwsem); + if (*locked) { + percpu_up_write(&cgroup_threadgroup_rwsem); + *locked = false; + } out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task) +void cgroup_procs_write_finish(struct task_struct *task, bool locked) __releases(&cgroup_threadgroup_rwsem) { struct cgroup_subsys *ss; @@ -2879,7 +2823,8 @@ void cgroup_procs_write_finish(struct task_struct *task) /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (locked) + percpu_up_write(&cgroup_threadgroup_rwsem); for_each_subsys(ss, ssid) if (ss->post_attach) ss->post_attach(); @@ -3600,22 +3545,22 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? 
&psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - struct cgroup *cgroup = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_id(cgrp) == 1 ? &psi_system : &cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -4567,9 +4512,6 @@ repeat: void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it) { - /* no one should try to iterate before mounting cgroups */ - WARN_ON_ONCE(!use_task_css_set_links); - memset(it, 0, sizeof(*it)); spin_lock_irq(&css_set_lock); @@ -4754,12 +4696,13 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, true); + task = cgroup_procs_write_start(buf, true, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4777,7 +4720,7 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, true); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -4795,6 +4738,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; ssize_t ret; + bool locked; buf = strstrip(buf); @@ -4802,7 +4746,7 @@ static ssize_t cgroup_threads_write(struct 
kernfs_open_file *of, if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, false); + task = cgroup_procs_write_start(buf, false, &locked); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -4826,7 +4770,7 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: - cgroup_procs_write_finish(task); + cgroup_procs_write_finish(task, locked); out_unlock: cgroup_kn_unlock(of->kn); @@ -5036,9 +4980,6 @@ static void css_release_work_fn(struct work_struct *work) tcgrp->nr_dying_descendants--; spin_unlock_irq(&css_set_lock); - cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - cgrp->id = -1; - /* * There are two control paths which try to determine * cgroup from dentry without going through kernfs - @@ -5203,10 +5144,12 @@ err_free_css: * it isn't associated with its kernfs_node and doesn't have the control * mask applied. */ -static struct cgroup *cgroup_create(struct cgroup *parent) +static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup_root *root = parent->root; struct cgroup *cgrp, *tcgrp; + struct kernfs_node *kn; int level = parent->level + 1; int ret; @@ -5226,15 +5169,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_cancel_ref; } - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. 
- */ - cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); - if (cgrp->id < 0) { - ret = -ENOMEM; + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); goto out_stat_exit; } + cgrp->kn = kn; init_cgroup_housekeeping(cgrp); @@ -5244,7 +5185,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) ret = psi_cgroup_alloc(cgrp); if (ret) - goto out_idr_free; + goto out_kernfs_remove; ret = cgroup_bpf_inherit(cgrp); if (ret) @@ -5268,7 +5209,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; + cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5298,12 +5239,6 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgroup_get_live(parent); /* - * @cgrp is now fully operational. If something fails after this - * point, it'll be released via the normal destruction path. - */ - cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - - /* * On the default hierarchy, a child doesn't automatically inherit * subtree_control from the parent. Each is configured manually. 
*/ @@ -5316,8 +5251,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) out_psi_free: psi_cgroup_free(cgrp); -out_idr_free: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); +out_kernfs_remove: + kernfs_remove(cgrp->kn); out_stat_exit: if (cgroup_on_dfl(parent)) cgroup_rstat_exit(cgrp); @@ -5354,7 +5289,6 @@ fail: int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent, *cgrp; - struct kernfs_node *kn; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ @@ -5370,27 +5304,19 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) goto out_unlock; } - cgrp = cgroup_create(parent); + cgrp = cgroup_create(parent, name, mode); if (IS_ERR(cgrp)) { ret = PTR_ERR(cgrp); goto out_unlock; } - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_destroy; - } - cgrp->kn = kn; - /* * This extra ref will be put in cgroup_free_fn() and guarantees * that @cgrp->kn is always accessible. 
*/ - kernfs_get(kn); + kernfs_get(cgrp->kn); - ret = cgroup_kn_set_ugid(kn); + ret = cgroup_kn_set_ugid(cgrp->kn); if (ret) goto out_destroy; @@ -5405,7 +5331,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) TRACE_CGROUP_PATH(mkdir, cgrp); /* let's create and online css's */ - kernfs_activate(kn); + kernfs_activate(cgrp->kn); ret = 0; goto out_unlock; @@ -5835,12 +5761,11 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); -void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, - char *buf, size_t buflen) +void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) { struct kernfs_node *kn; - kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) return; kernfs_path(kn, buf, buflen); @@ -6001,62 +5926,38 @@ void cgroup_cancel_fork(struct task_struct *child) void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; + struct css_set *cset; int i; + spin_lock_irq(&css_set_lock); + + WARN_ON_ONCE(!list_empty(&child->cg_list)); + cset = task_css_set(current); /* current is @child's parent */ + get_css_set(cset); + cset->nr_tasks++; + css_set_move_task(child, NULL, cset, false); + /* - * This may race against cgroup_enable_task_cg_lists(). As that - * function sets use_task_css_set_links before grabbing - * tasklist_lock and we just went through tasklist_lock to add - * @child, it's guaranteed that either we see the set - * use_task_css_set_links or cgroup_enable_task_cg_lists() sees - * @child during its iteration. - * - * If we won the race, @child is associated with %current's - * css_set. Grabbing css_set_lock guarantees both that the - * association is stable, and, on completion of the parent's - * migration, @child is visible in the source of migration or - * already in the destination cgroup. 
This guarantee is necessary - * when implementing operations which need to migrate all tasks of - * a cgroup to another. - * - * Note that if we lose to cgroup_enable_task_cg_lists(), @child - * will remain in init_css_set. This is safe because all tasks are - * in the init_css_set before cg_links is enabled and there's no - * operation which transfers all tasks out of init_css_set. + * If the cgroup has to be frozen, the new task has too. Let's set + * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the + * frozen state. */ - if (use_task_css_set_links) { - struct css_set *cset; - - spin_lock_irq(&css_set_lock); - cset = task_css_set(current); - if (list_empty(&child->cg_list)) { - get_css_set(cset); - cset->nr_tasks++; - css_set_move_task(child, NULL, cset, false); - } + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); /* - * If the cgroup has to be frozen, the new task has too. - * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get - * the task into the frozen state. + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient switch + * from the frozen state and back. */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); - - /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later - * from do_freezer_trap(). So we avoid cgroup's - * transient switch from the frozen state and back. - */ - } - - spin_unlock_irq(&css_set_lock); } + spin_unlock_irq(&css_set_lock); + /* * Call ss->fork(). 
This must happen after @child is linked on * css_set; otherwise, @child might change state between ->fork() @@ -6071,20 +5972,8 @@ void cgroup_post_fork(struct task_struct *child) * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * - * Description: Detach cgroup from @tsk and release it. - * - * Note that cgroups marked notify_on_release force every task in - * them to take the global cgroup_mutex mutex when exiting. - * This could impact scaling on very large systems. Be reluctant to - * use notify_on_release cgroups where very high task exit scaling - * is required on large systems. + * Description: Detach cgroup from @tsk. * - * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We - * call cgroup_exit() while the task is still competent to handle - * notify_on_release(), then leave the task attached to the root cgroup in - * each hierarchy for the remainder of its exit. No need to bother with - * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - PF_EXITING is visible to migration path. */ void cgroup_exit(struct task_struct *tsk) { @@ -6092,26 +5981,19 @@ void cgroup_exit(struct task_struct *tsk) struct css_set *cset; int i; - /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. 
- */ - cset = task_css_set(tsk); + spin_lock_irq(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); - css_set_move_task(tsk, cset, NULL, false); - list_add_tail(&tsk->cg_list, &cset->dying_tasks); - cset->nr_tasks--; + WARN_ON_ONCE(list_empty(&tsk->cg_list)); + cset = task_css_set(tsk); + css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); + cset->nr_tasks--; - WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) - cgroup_update_frozen(task_dfl_cgroup(tsk)); + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); - spin_unlock_irq(&css_set_lock); - } else { - get_css_set(cset); - } + spin_unlock_irq(&css_set_lock); /* see cgroup_post_fork() for details */ do_each_subsys_mask(ss, i, have_exit_callback) { @@ -6128,12 +6010,10 @@ void cgroup_release(struct task_struct *task) ss->release(task); } while_each_subsys_mask(); - if (use_task_css_set_links) { - spin_lock_irq(&css_set_lock); - css_set_skip_task_iters(task_css_set(task), task); - list_del_init(&task->cg_list); - spin_unlock_irq(&css_set_lock); - } + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); } void cgroup_free(struct task_struct *task) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c52bc91f882b..58f5073acff7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -798,7 +798,8 @@ static int generate_sched_domains(cpumask_var_t **domains, cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus)) continue; - if (is_sched_load_balance(cp)) + if (is_sched_load_balance(cp) && + !cpumask_empty(cp->effective_cpus)) csa[csn++] = cp; /* skip @cp's subtree if not a partition root */ @@ -928,8 +929,6 @@ static void rebuild_root_domains(void) lockdep_assert_cpus_held(); 
lockdep_assert_held(&sched_domains_mutex); - cgroup_enable_task_cg_lists(); - rcu_read_lock(); /* diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 8cf010680678..3984dd6b8ddb 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -231,6 +231,15 @@ void cgroup_freezer_migrate_task(struct task_struct *task, return; /* + * It's not necessary to do changes if both of the src and dst cgroups + * are not freezing and task is not frozen. + */ + if (!test_bit(CGRP_FREEZE, &src->flags) && + !test_bit(CGRP_FREEZE, &dst->flags) && + !task->frozen) + return; + + /* * Adjust counters of freezing and frozen tasks. * Note, that if the task is frozen, but the destination cgroup is not * frozen, we bump both counters to keep them balanced. diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 8e513a573fe9..138059eb730d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -45,7 +45,7 @@ struct pids_cgroup { * %PIDS_MAX = (%PID_MAX_LIMIT + 1). */ atomic64_t counter; - int64_t limit; + atomic64_t limit; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -73,8 +73,8 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - pids->limit = PIDS_MAX; atomic64_set(&pids->counter, 0); + atomic64_set(&pids->limit, PIDS_MAX); atomic64_set(&pids->events_limit, 0); return &pids->css; } @@ -146,13 +146,14 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); + int64_t limit = atomic64_read(&p->limit); /* * Since new is capped to the maximum number of pid_t, if * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > p->limit) + if (new > limit) goto revert; } @@ -277,7 +278,7 @@ set_limit: * Limit updates don't need to be mutex'd, since it isn't * critical that any racing fork()s follow the new limit. 
*/ - pids->limit = limit; + atomic64_set(&pids->limit, limit); return nbytes; } @@ -285,7 +286,7 @@ static int pids_max_show(struct seq_file *sf, void *v) { struct cgroup_subsys_state *css = seq_css(sf); struct pids_cgroup *pids = css_pids(css); - int64_t limit = pids->limit; + int64_t limit = atomic64_read(&pids->limit); if (limit >= PIDS_MAX) seq_printf(sf, "%s\n", PIDS_MAX_STR); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index ca19b4c8acf5..b48b22d4deb6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -304,44 +304,48 @@ void __init cgroup_rstat_boot(void) * Functions for cgroup basic resource statistics implemented on top of * rstat. */ -static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, - struct cgroup_base_stat *src_bstat) +static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } +static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime -= src_bstat->cputime.utime; + dst_bstat->cputime.stime -= src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; +} + static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; - struct task_cputime cputime; - struct cgroup_base_stat delta; + struct cgroup_base_stat cur, delta; unsigned seq; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cputime = rstatc->bstat.cputime; + cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* calculate the 
delta to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); - memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_base_stat_accumulate(&cgrp->bstat, &delta); - if (parent) - cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); + /* propagate percpu delta to global */ + delta = cur; + cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_add(&cgrp->bstat, &delta); + cgroup_base_stat_add(&rstatc->last_bstat, &delta); + + /* propagate global delta to parent */ + if (parent) { + delta = cgrp->bstat; + cgroup_base_stat_sub(&delta, &cgrp->last_bstat); + cgroup_base_stat_add(&parent->bstat, &delta); + cgroup_base_stat_add(&cgrp->last_bstat, &delta); + } } static struct cgroup_rstat_cpu * diff --git a/kernel/cpu.c b/kernel/cpu.c index fc28e17940e0..e2cad3ee2ead 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2373,7 +2373,18 @@ void __init boot_cpu_hotplug_init(void) this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +/* + * These are used for a global "mitigations=" cmdline option for toggling + * optional CPU mitigations. 
+ */ +enum cpu_mitigations { + CPU_MITIGATIONS_OFF, + CPU_MITIGATIONS_AUTO, + CPU_MITIGATIONS_AUTO_NOSMT, +}; + +static enum cpu_mitigations cpu_mitigations __ro_after_init = + CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { @@ -2390,3 +2401,17 @@ static int __init mitigations_parse_cmdline(char *arg) return 0; } early_param("mitigations", mitigations_parse_cmdline); + +/* mitigations=off */ +bool cpu_mitigations_off(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_OFF; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_off); + +/* mitigations=auto,nosmt */ +bool cpu_mitigations_auto_nosmt(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 099002d84f46..a26170469543 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -161,7 +161,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) { #ifdef CONFIG_STACKTRACE if (entry) { - pr_warning("Mapped at:\n"); + pr_warn("Mapped at:\n"); stack_trace_print(entry->stack_entries, entry->stack_len, 0); } #endif diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8402b29c280f..0b67c04e531b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -16,12 +16,11 @@ #include <linux/swiotlb.h> /* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but - * some use it for entirely different regions: + * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it + * it for entirely different regions. In that case the arch code needs to + * override the variable below for dma-direct to work properly. 
*/ -#ifndef ARCH_ZONE_DMA_BITS -#define ARCH_ZONE_DMA_BITS 24 -#endif +unsigned int zone_dma_bits __ro_after_init = 24; static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { @@ -69,7 +68,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ - if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; if (*phys_mask <= DMA_BIT_MASK(32)) return GFP_DMA32; @@ -395,7 +394,7 @@ int dma_direct_supported(struct device *dev, u64 mask) u64 min_mask; if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); + min_mask = DMA_BIT_MASK(zone_dma_bits); else min_mask = DMA_BIT_MASK(32); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index ca4e5d44b571..c00b9258fa6a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -87,9 +87,9 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, */ void dma_common_free_remap(void *cpu_addr, size_t size) { - struct page **pages = dma_common_find_pages(cpu_addr); + struct vm_struct *area = find_vm_area(cpu_addr); - if (!pages) { + if (!area || area->flags != VM_DMA_COHERENT) { WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); return; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 4655adbbae10..2cba42957b35 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1031,7 +1031,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { } -void +static inline void perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } @@ -3779,11 +3779,23 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) perf_event_groups_insert(&ctx->flexible_groups, event); } +/* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_first_active(struct perf_event_context *ctx) +ctx_event_to_rotate(struct 
perf_event_context *ctx) { - return list_first_entry_or_null(&ctx->flexible_active, - struct perf_event, active_list); + struct perf_event *event; + + /* pick the first active flexible event */ + event = list_first_entry_or_null(&ctx->flexible_active, + struct perf_event, active_list); + + /* if no active flexible event, pick the first event */ + if (!event) { + event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), + typeof(*event), group_node); + } + + return event; } static bool perf_rotate_context(struct perf_cpu_context *cpuctx) @@ -3808,9 +3820,9 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) - task_event = ctx_first_active(task_ctx); + task_event = ctx_event_to_rotate(task_ctx); if (cpu_rotate) - cpu_event = ctx_first_active(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(&cpuctx->ctx); /* * As per the order given at ctx_resched() first 'pop' task flexible @@ -5017,6 +5029,24 @@ static void _perf_event_reset(struct perf_event *event) perf_event_update_userpage(event); } +/* Assume it's not an event with inherit set. 
*/ +u64 perf_event_pause(struct perf_event *event, bool reset) +{ + struct perf_event_context *ctx; + u64 count; + + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(event->attr.inherit); + _perf_event_disable(event); + count = local64_read(&event->count); + if (reset) + local64_set(&event->count, 0); + perf_event_ctx_unlock(event, ctx); + + return count; +} +EXPORT_SYMBOL_GPL(perf_event_pause); + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -5094,16 +5124,11 @@ static int perf_event_check_period(struct perf_event *event, u64 value) return event->pmu->check_period(event, value); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) +static int _perf_event_period(struct perf_event *event, u64 value) { - u64 value; - if (!is_sampling_event(event)) return -EINVAL; - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - if (!value) return -EINVAL; @@ -5121,6 +5146,19 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) return 0; } +int perf_event_period(struct perf_event *event, u64 value) +{ + struct perf_event_context *ctx; + int ret; + + ctx = perf_event_ctx_lock(event); + ret = _perf_event_period(event, value); + perf_event_ctx_unlock(event, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(perf_event_period); + static const struct file_operations perf_fops; static inline int perf_fget_light(int fd, struct fd *p) @@ -5164,8 +5202,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon return _perf_event_refresh(event, arg); case PERF_EVENT_IOC_PERIOD: - return perf_event_period(event, (u64 __user *)arg); + { + u64 value; + + if (copy_from_user(&value, (u64 __user *)arg, sizeof(value))) + return -EFAULT; + return _perf_event_period(event, value); + } case PERF_EVENT_IOC_ID: { u64 id = primary_event_id(event); @@ -5595,8 +5639,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) 
perf_pmu_output_stop(event); /* now it's safe to free the pages */ - atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); + if (!rb->aux_mmap_locked) + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + else + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); @@ -5668,7 +5714,8 @@ again: * undo the VM accounting. */ - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked, + &mmap_user->locked_vm); atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); @@ -5812,8 +5859,20 @@ accounting: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - if (user_locked > user_lock_limit) + if (user_locked <= user_lock_limit) { + /* charge all to locked_vm */ + } else if (atomic_long_read(&user->locked_vm) >= user_lock_limit) { + /* charge all to pinned_vm */ + extra = user_extra; + user_extra = 0; + } else { + /* + * charge locked_vm until it hits user_lock_limit; + * charge the rest from pinned_vm + */ extra = user_locked - user_lock_limit; + user_extra -= extra; + } lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; @@ -6922,7 +6981,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->pmu; + struct pmu *pmu = event->ctx->pmu; struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, @@ -10508,6 +10567,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + /* + * Disallow uncore-cgroup events, they don't make sense as the cgroup will + * be different on other CPUs in the uncore mask. 
+ */ + if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + err = -EINVAL; + goto err_pmu; + } + if (event->attr.aux_output && !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { err = -EOPNOTSUPP; @@ -10586,58 +10654,29 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ + /* Zero the full structure, so that a short copy will be nice. */ memset(attr, 0, sizeof(*attr)); ret = get_user(size, &uattr->size); if (ret) return ret; - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ + /* ABI compatibility quirk: */ + if (!size) size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) + if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. 
- */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - attr->size = size; - if (attr->__reserved_1) + if (attr->__reserved_1 || attr->__reserved_2) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) @@ -11325,8 +11364,11 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, int err; /* - * Get the target context (task or percpu): + * Grouping is not supported for kernel events, neither is 'AUX', + * make sure the caller's intentions are adjusted. */ + if (attr->aux_output) + return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler, context, -1); @@ -11338,6 +11380,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. 
*/ event->owner = TASK_TOMBSTONE; + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); @@ -11789,7 +11834,7 @@ inherit_event(struct perf_event *parent_event, GFP_KERNEL); if (!child_ctx->task_ctx_data) { free_event(child_event); - return NULL; + return ERR_PTR(-ENOMEM); } } @@ -11891,6 +11936,10 @@ static int inherit_group(struct perf_event *parent_event, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); + + if (sub->aux_event == parent_event && child_ctr && + !perf_get_aux_event(child_ctr, leader)) + return -EINVAL; } return 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 94d38a39d72e..c74761004ee5 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,14 +474,17 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; + unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: + if (is_register) + gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ - ret = get_user_pages_remote(NULL, mm, vaddr, 1, - FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags, + &old_page, &vma, NULL); if (ret <= 0) return ret; @@ -489,6 +492,12 @@ retry: if (ret <= 0) goto put_old; + if (WARN(!is_register && PageCompound(old_page), + "uprobe unregister should never work on compound page\n")) { + ret = -EINVAL; + goto put_old; + } + /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 
1 : -1); diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..f2d20ab74422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1457,7 +1457,7 @@ repeat: */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && - (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) + (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..35f91ee91057 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(¤t->sighand->siglock); + + /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ + if (clone_flags & CLONE_CLEAR_SIGHAND) + flush_signal_handlers(tsk, 0); + return 0; } @@ -1695,12 +1700,68 @@ static int pidfd_release(struct inode *inode, struct file *file) } #ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. + * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid of a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. 
+ * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc/<pid>/status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. where no ancestoral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; + struct pid_namespace *ns; + pid_t nr = -1; + + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); + +#ifdef CONFIG_PID_NS + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { + int i; - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. + */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); + } +#endif seq_putc(m, '\n'); } #endif @@ -1708,11 +1769,11 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) /* * Poll support for process exit notification. 
*/ -static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct task_struct *task; struct pid *pid = file->private_data; - int poll_flags = 0; + __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); @@ -1724,7 +1785,7 @@ static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) * group, then poll(2) should block, similar to the wait(2) family. */ if (!task || (task->exit_state && thread_group_empty(task))) - poll_flags = POLLIN | POLLRDNORM; + poll_flags = EPOLLIN | EPOLLRDNORM; rcu_read_unlock(); return poll_flags; @@ -2026,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process( stackleak_task_init(p); if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, + args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; @@ -2525,39 +2587,29 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, #ifdef __ARCH_WANT_SYS_CLONE3 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, - size_t size) + size_t usize) { + int err; struct clone_args args; + pid_t *kset_tid = kargs->set_tid; - if (unlikely(size > PAGE_SIZE)) + if (unlikely(usize > PAGE_SIZE)) return -E2BIG; - - if (unlikely(size < sizeof(struct clone_args))) + if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; - if (unlikely(!access_ok(uargs, size))) - return -EFAULT; - - if (size > sizeof(struct clone_args)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uargs + sizeof(struct clone_args); - end = (void __user *)uargs + size; + err = copy_struct_from_user(&args, sizeof(args), uargs, usize); + if (err) + return err; - for (; addr < end; addr++) { - if (get_user(val, addr)) - return 
-EFAULT; - if (val) - return -E2BIG; - } + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + return -EINVAL; - size = sizeof(struct clone_args); - } + if (unlikely(!args.set_tid && args.set_tid_size > 0)) + return -EINVAL; - if (copy_from_user(&args, uargs, size)) - return -EFAULT; + if (unlikely(args.set_tid && args.set_tid_size == 0)) + return -EINVAL; /* * Verify that higher 32bits of exit_signal are unset and that @@ -2576,18 +2628,51 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, + .set_tid_size = args.set_tid_size, }; + if (args.set_tid && + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + (kargs->set_tid_size * sizeof(pid_t)))) + return -EFAULT; + + kargs->set_tid = kset_tid; + return 0; } -static bool clone3_args_valid(const struct kernel_clone_args *kargs) +/** + * clone3_stack_valid - check and prepare stack + * @kargs: kernel clone args + * + * Verify that the stack arguments userspace gave us are sane. + * In addition, set the stack direction for userspace since it's easy for us to + * determine. + */ +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) { - /* - * All lower bits of the flag word are taken. - * Verify that no other unknown flags are passed along. - */ - if (kargs->flags & ~CLONE_LEGACY_FLAGS) + if (kargs->stack == 0) { + if (kargs->stack_size > 0) + return false; + } else { + if (kargs->stack_size == 0) + return false; + + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) + return false; + +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) + kargs->stack += kargs->stack_size; +#endif + } + + return true; +} + +static bool clone3_args_valid(struct kernel_clone_args *kargs) +{ + /* Verify that no unknown flags are passed along. 
*/ + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) return false; /* @@ -2597,18 +2682,39 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) + return false; + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; + if (!clone3_stack_valid(kargs)) + return false; + return true; } +/** + * clone3 - create a new process with specific properties + * @uargs: argument structure + * @size: size of @uargs + * + * clone3() is the extensible successor to clone()/clone2(). + * It takes a struct as argument that is versioned by its size. + * + * Return: On success, a positive PID for the child process. + * On error, a negative errno number. + */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; struct kernel_clone_args kargs; + pid_t set_tid[MAX_PID_NS_LEVEL]; + + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) @@ -2934,7 +3040,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, struct ctl_table t; int ret; int threads = max_threads; - int min = MIN_THREADS; + int min = 1; int max = MAX_THREADS; t = *table; @@ -2946,7 +3052,7 @@ int sysctl_max_threads(struct ctl_table *table, int write, if (ret || !write) return ret; - set_max_threads(threads); + max_threads = threads; return 0; } diff --git a/kernel/freezer.c b/kernel/freezer.c index c0738424bb43..dc520f01f99d 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -22,12 +22,6 @@ EXPORT_SYMBOL(system_freezing_cnt); bool pm_freezing; bool pm_nosig_freezing; -/* - * Temporary export for the deadlock workaround in ata_scsi_hotplug(). - * Remove once the hack becomes unnecessary. 
- */ -EXPORT_SYMBOL_GPL(pm_freezing); - /* protects freezing and frozen transitions */ static DEFINE_SPINLOCK(freezer_lock); diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 9ff449888d9c..5a0fc0b0403a 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -71,7 +71,13 @@ done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 find $cpio_dir -type f -print0 | xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' -tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null +# Create archive and try to normalize metadata for reproducibility. +# For compatibility with older versions of tar, files are fed to tar +# pre-sorted, as --sort=name might not be available. +find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ + tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ + --owner=0 --group=0 --numeric-owner --no-recursion \ + -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null echo "$src_files_md5" > kernel/kheaders.md5 echo "$obj_files_md5" >> kernel/kheaders.md5 diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 132672b74e4b..dd822fd8a7d5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); * @type: Type of irqchip_fwnode. See linux/irqdomain.h * @name: Optional user provided domain name * @id: Optional user provided id if name != NULL - * @data: Optional user-provided data + * @pa: Optional user-provided physical address * * Allocate a struct irqchip_fwid, and return a poiner to the embedded * fwnode_handle (or NULL on failure). 
diff --git a/kernel/kthread.c b/kernel/kthread.c index 621467c33fef..b262f47046ca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -866,9 +866,9 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); -void __kthread_queue_delayed_work(struct kthread_worker *worker, - struct kthread_delayed_work *dwork, - unsigned long delay) +static void __kthread_queue_delayed_work(struct kthread_worker *worker, + struct kthread_delayed_work *dwork, + unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index cf9b5bcdb952..cf03d4bdfc66 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o shadow.o transition.o +livepatch-objs := core.o patch.o shadow.o state.o transition.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ab4a4606d19b..c3512e7e0801 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -22,6 +22,7 @@ #include <asm/cacheflush.h> #include "core.h" #include "patch.h" +#include "state.h" #include "transition.h" /* @@ -632,7 +633,7 @@ static void klp_free_objects_dynamic(struct klp_patch *patch) * The operation must be completed by calling klp_free_patch_finish() * outside klp_mutex. 
*/ -void klp_free_patch_start(struct klp_patch *patch) +static void klp_free_patch_start(struct klp_patch *patch) { if (!list_empty(&patch->list)) list_del(&patch->list); @@ -677,6 +678,23 @@ static void klp_free_patch_work_fn(struct work_struct *work) klp_free_patch_finish(patch); } +void klp_free_patch_async(struct klp_patch *patch) +{ + klp_free_patch_start(patch); + schedule_work(&patch->free_work); +} + +void klp_free_replaced_patches_async(struct klp_patch *new_patch) +{ + struct klp_patch *old_patch, *tmp_patch; + + klp_for_each_patch_safe(old_patch, tmp_patch) { + if (old_patch == new_patch) + return; + klp_free_patch_async(old_patch); + } +} + static int klp_init_func(struct klp_object *obj, struct klp_func *func) { if (!func->old_name) @@ -992,6 +1010,13 @@ int klp_enable_patch(struct klp_patch *patch) mutex_lock(&klp_mutex); + if (!klp_is_patch_compatible(patch)) { + pr_err("Livepatch patch (%s) is not compatible with the already installed livepatches.\n", + patch->mod->name); + mutex_unlock(&klp_mutex); + return -EINVAL; + } + ret = klp_init_patch_early(patch); if (ret) { mutex_unlock(&klp_mutex); @@ -1022,12 +1047,13 @@ err: EXPORT_SYMBOL_GPL(klp_enable_patch); /* - * This function removes replaced patches. + * This function unpatches objects from the replaced livepatches. * * We could be pretty aggressive here. It is called in the situation where - * these structures are no longer accessible. All functions are redirected - * by the klp_transition_patch. They use either a new code or they are in - * the original code because of the special nop function patches. + * these structures are no longer accessed from the ftrace handler. + * All functions are redirected by the klp_transition_patch. They + * use either a new code or they are in the original code because + * of the special nop function patches. * * The only exception is when the transition was forced. In this case, * klp_ftrace_handler() might still see the replaced patch on the stack. 
@@ -1035,18 +1061,16 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * thanks to RCU. We only have to keep the patches on the system. Also * this is handled transparently by patch->module_put. */ -void klp_discard_replaced_patches(struct klp_patch *new_patch) +void klp_unpatch_replaced_patches(struct klp_patch *new_patch) { - struct klp_patch *old_patch, *tmp_patch; + struct klp_patch *old_patch; - klp_for_each_patch_safe(old_patch, tmp_patch) { + klp_for_each_patch(old_patch) { if (old_patch == new_patch) return; old_patch->enabled = false; klp_unpatch_objects(old_patch); - klp_free_patch_start(old_patch); - schedule_work(&old_patch->free_work); } } diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index ec43a40b853f..38209c7361b6 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -13,8 +13,9 @@ extern struct list_head klp_patches; #define klp_for_each_patch(patch) \ list_for_each_entry(patch, &klp_patches, list) -void klp_free_patch_start(struct klp_patch *patch); -void klp_discard_replaced_patches(struct klp_patch *new_patch); +void klp_free_patch_async(struct klp_patch *patch); +void klp_free_replaced_patches_async(struct klp_patch *new_patch); +void klp_unpatch_replaced_patches(struct klp_patch *new_patch); void klp_discard_nops(struct klp_patch *new_patch); static inline bool klp_is_object_loaded(struct klp_object *obj) diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c new file mode 100644 index 000000000000..7ee19476de9d --- /dev/null +++ b/kernel/livepatch/state.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * system_state.c - State of the system modified by livepatches + * + * Copyright (C) 2019 SUSE + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/livepatch.h> +#include "core.h" +#include "state.h" +#include "transition.h" + +#define klp_for_each_state(patch, state) \ + for (state = patch->states; state && state->id; state++) + +/** + * klp_get_state() - get information 
about system state modified by + * the given patch + * @patch: livepatch that modifies the given system state + * @id: custom identifier of the modified system state + * + * Checks whether the given patch modifies the given system state. + * + * The function can be called either from pre/post (un)patch + * callbacks or from the kernel code added by the livepatch. + * + * Return: pointer to struct klp_state when found, otherwise NULL. + */ +struct klp_state *klp_get_state(struct klp_patch *patch, unsigned long id) +{ + struct klp_state *state; + + klp_for_each_state(patch, state) { + if (state->id == id) + return state; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(klp_get_state); + +/** + * klp_get_prev_state() - get information about system state modified by + * the already installed livepatches + * @id: custom identifier of the modified system state + * + * Checks whether already installed livepatches modify the given + * system state. + * + * The same system state can be modified by more non-cumulative + * livepatches. It is expected that the latest livepatch has + * the most up-to-date information. + * + * The function can be called only during transition when a new + * livepatch is being enabled or when such a transition is reverted. + * It is typically called only from from pre/post (un)patch + * callbacks. + * + * Return: pointer to the latest struct klp_state from already + * installed livepatches, NULL when not found. + */ +struct klp_state *klp_get_prev_state(unsigned long id) +{ + struct klp_patch *patch; + struct klp_state *state, *last_state = NULL; + + if (WARN_ON_ONCE(!klp_transition_patch)) + return NULL; + + klp_for_each_patch(patch) { + if (patch == klp_transition_patch) + goto out; + + state = klp_get_state(patch, id); + if (state) + last_state = state; + } + +out: + return last_state; +} +EXPORT_SYMBOL_GPL(klp_get_prev_state); + +/* Check if the patch is able to deal with the existing system state. 
*/ +static bool klp_is_state_compatible(struct klp_patch *patch, + struct klp_state *old_state) +{ + struct klp_state *state; + + state = klp_get_state(patch, old_state->id); + + /* A cumulative livepatch must handle all already modified states. */ + if (!state) + return !patch->replace; + + return state->version >= old_state->version; +} + +/* + * Check that the new livepatch will not break the existing system states. + * Cumulative patches must handle all already modified states. + * Non-cumulative patches can touch already modified states. + */ +bool klp_is_patch_compatible(struct klp_patch *patch) +{ + struct klp_patch *old_patch; + struct klp_state *old_state; + + klp_for_each_patch(old_patch) { + klp_for_each_state(old_patch, old_state) { + if (!klp_is_state_compatible(patch, old_state)) + return false; + } + } + + return true; +} diff --git a/kernel/livepatch/state.h b/kernel/livepatch/state.h new file mode 100644 index 000000000000..49d9c16e8762 --- /dev/null +++ b/kernel/livepatch/state.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIVEPATCH_STATE_H +#define _LIVEPATCH_STATE_H + +#include <linux/livepatch.h> + +bool klp_is_patch_compatible(struct klp_patch *patch); + +#endif /* _LIVEPATCH_STATE_H */ diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index cdf318d86dd6..f6310f848f34 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -78,7 +78,7 @@ static void klp_complete_transition(void) klp_target_state == KLP_PATCHED ? 
"patching" : "unpatching"); if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { - klp_discard_replaced_patches(klp_transition_patch); + klp_unpatch_replaced_patches(klp_transition_patch); klp_discard_nops(klp_transition_patch); } @@ -446,14 +446,14 @@ void klp_try_complete_transition(void) klp_complete_transition(); /* - * It would make more sense to free the patch in + * It would make more sense to free the unused patches in * klp_complete_transition() but it is called also * from klp_cancel_transition(). */ - if (!patch->enabled) { - klp_free_patch_start(patch); - schedule_work(&patch->free_work); - } + if (!patch->enabled) + klp_free_patch_async(patch); + else if (patch->replace) + klp_free_replaced_patches_async(patch); } /* diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..acf7962936c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3222,7 +3222,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) #endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(info, "__mcount_loc", + mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif diff --git a/kernel/panic.c b/kernel/panic.c index 47e8ebccc22b..f470a038b05b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -180,6 +180,7 @@ void panic(const char *fmt, ...) * after setting panic_cpu) from invoking panic() again. 
*/ local_irq_disable(); + preempt_disable_notrace(); /* * It's possible to come here directly from a panic-assertion and diff --git a/kernel/pid.c b/kernel/pid.c index 0a9f2e437217..2278e249141d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -157,7 +157,8 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size) { struct pid *pid; enum pid_type type; @@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns) struct upid *upid; int retval = -ENOMEM; + /* + * set_tid_size contains the size of the set_tid array. Starting at + * the most nested currently active PID namespace it tells alloc_pid() + * which PID to set for a process in that most nested PID namespace + * up to set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but set_tid_size must + * never be greater than the current ns->level + 1. + */ + if (set_tid_size > ns->level + 1) + return ERR_PTR(-EINVAL); + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); @@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { - int pid_min = 1; + int tid = 0; + + if (set_tid_size) { + tid = set_tid[ns->level - i]; + + retval = -EINVAL; + if (tid < 1 || tid >= pid_max) + goto out_free; + /* + * Also fail if a PID != 1 is requested and + * no PID 1 exists. 
+ */ + if (tid != 1 && !tmp->child_reaper) + goto out_free; + retval = -EPERM; + if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + goto out_free; + set_tid_size--; + } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); - /* - * init really needs pid 1, but after reaching the maximum - * wrap back to RESERVED_PIDS - */ - if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) - pid_min = RESERVED_PIDS; - - /* - * Store a null pointer so find_pid_ns does not find - * a partially initialized PID (see below). - */ - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + if (tid) { + nr = idr_alloc(&tmp->idr, NULL, tid, + tid + 1, GFP_ATOMIC); + /* + * If ENOSPC is returned it means that the PID is + * alreay in use. Return EEXIST in that case. + */ + if (nr == -ENOSPC) + nr = -EEXIST; + } else { + int pid_min = 1; + /* + * init really needs pid 1, but after reaching the + * maximum wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). 
+ */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + } spin_unlock_irq(&pidmap_lock); idr_preload_end(); @@ -299,7 +341,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type, *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) - if (!hlist_empty(&pid->tasks[tmp])) + if (pid_has_task(pid, tmp)) return; free_pid(pid); @@ -497,7 +539,7 @@ static int pidfd_create(struct pid *pid) */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { - int fd, ret; + int fd; struct pid *p; if (flags) @@ -510,13 +552,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - ret = 0; - rcu_read_lock(); - if (!pid_task(p, PIDTYPE_TGID)) - ret = -EINVAL; - rcu_read_unlock(); + if (pid_has_task(p, PIDTYPE_TGID)) + fd = pidfd_create(p); + else + fd = -EINVAL; - fd = ret ?: pidfd_create(p); put_pid(p); return fd; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a6a79f85c81a..d40017e79ebe 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -26,8 +26,6 @@ static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 /* Write once array, filled from the beginning. */ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; diff --git a/kernel/power/main.c b/kernel/power/main.c index e8710d179b35..e26de7af520b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,6 +15,7 @@ #include <linux/seq_file.h> #include <linux/suspend.h> #include <linux/syscalls.h> +#include <linux/pm_runtime.h> #include "power.h" diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9568a2fe7c11..a45cba7df0ae 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -650,3 +650,249 @@ static int __init pm_qos_power_init(void) } late_initcall(pm_qos_power_init); + +/* Definitions related to the frequency QoS below. 
*/ + +/** + * freq_constraints_init - Initialize frequency QoS constraints. + * @qos: Frequency QoS constraints to initialize. + */ +void freq_constraints_init(struct freq_constraints *qos) +{ + struct pm_qos_constraints *c; + + c = &qos->min_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MIN_DEFAULT_VALUE; + c->type = PM_QOS_MAX; + c->notifiers = &qos->min_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); + + c = &qos->max_freq; + plist_head_init(&c->list); + c->target_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->default_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->no_constraint_value = FREQ_QOS_MAX_DEFAULT_VALUE; + c->type = PM_QOS_MIN; + c->notifiers = &qos->max_freq_notifiers; + BLOCKING_INIT_NOTIFIER_HEAD(c->notifiers); +} + +/** + * freq_qos_read_value - Get frequency QoS constraint for a given list. + * @qos: Constraints to evaluate. + * @type: QoS request type. + */ +s32 freq_qos_read_value(struct freq_constraints *qos, + enum freq_qos_req_type type) +{ + s32 ret; + + switch (type) { + case FREQ_QOS_MIN: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MIN_DEFAULT_VALUE : + pm_qos_read_value(&qos->min_freq); + break; + case FREQ_QOS_MAX: + ret = IS_ERR_OR_NULL(qos) ? + FREQ_QOS_MAX_DEFAULT_VALUE : + pm_qos_read_value(&qos->max_freq); + break; + default: + WARN_ON(1); + ret = 0; + } + + return ret; +} + +/** + * freq_qos_apply - Add/modify/remove frequency QoS request. + * @req: Constraint request to apply. + * @action: Action to perform (add/update/remove). + * @value: Value to assign to the QoS request. 
+ */ +static int freq_qos_apply(struct freq_qos_request *req, + enum pm_qos_req_action action, s32 value) +{ + int ret; + + switch(req->type) { + case FREQ_QOS_MIN: + ret = pm_qos_update_target(&req->qos->min_freq, &req->pnode, + action, value); + break; + case FREQ_QOS_MAX: + ret = pm_qos_update_target(&req->qos->max_freq, &req->pnode, + action, value); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +/** + * freq_qos_add_request - Insert new frequency QoS request into a given list. + * @qos: Constraints to update. + * @req: Preallocated request object. + * @type: Request type. + * @value: Request value. + * + * Insert a new entry into the @qos list of requests, recompute the effective + * QoS constraint value for that list and initialize the @req object. The + * caller needs to save that object for later use in updates and removal. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_add_request(struct freq_constraints *qos, + struct freq_qos_request *req, + enum freq_qos_req_type type, s32 value) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !req) + return -EINVAL; + + if (WARN(freq_qos_request_active(req), + "%s() called for active request\n", __func__)) + return -EINVAL; + + req->qos = qos; + req->type = type; + ret = freq_qos_apply(req, PM_QOS_ADD_REQ, value); + if (ret < 0) { + req->qos = NULL; + req->type = 0; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_request); + +/** + * freq_qos_update_request - Modify existing frequency QoS request. + * @req: Request to modify. + * @new_value: New request value. + * + * Update an existing frequency QoS request along with the effective constraint + * value for the list of requests it belongs to. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. 
+ */ +int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) +{ + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + if (req->pnode.prio == new_value) + return 0; + + return freq_qos_apply(req, PM_QOS_UPDATE_REQ, new_value); +} +EXPORT_SYMBOL_GPL(freq_qos_update_request); + +/** + * freq_qos_remove_request - Remove frequency QoS request from its list. + * @req: Request to remove. + * + * Remove the given frequency QoS request from the list of constraints it + * belongs to and recompute the effective constraint value for that list. + * + * Return 1 if the effective constraint value has changed, 0 if the effective + * constraint value has not changed, or a negative error code on failures. + */ +int freq_qos_remove_request(struct freq_qos_request *req) +{ + int ret; + + if (!req) + return -EINVAL; + + if (WARN(!freq_qos_request_active(req), + "%s() called for unknown object\n", __func__)) + return -EINVAL; + + ret = freq_qos_apply(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + req->qos = NULL; + req->type = 0; + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_request); + +/** + * freq_qos_add_notifier - Add frequency QoS change notifier. + * @qos: List of requests to add the notifier to. + * @type: Request type. + * @notifier: Notifier block to add. 
+ */ +int freq_qos_add_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_register(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_register(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_add_notifier); + +/** + * freq_qos_remove_notifier - Remove frequency QoS change notifier. + * @qos: List of requests to remove the notifier from. + * @type: Request type. + * @notifier: Notifier block to remove. + */ +int freq_qos_remove_notifier(struct freq_constraints *qos, + enum freq_qos_req_type type, + struct notifier_block *notifier) +{ + int ret; + + if (IS_ERR_OR_NULL(qos) || !notifier) + return -EINVAL; + + switch (type) { + case FREQ_QOS_MIN: + ret = blocking_notifier_chain_unregister(qos->min_freq.notifiers, + notifier); + break; + case FREQ_QOS_MAX: + ret = blocking_notifier_chain_unregister(qos->max_freq.notifiers, + notifier); + break; + default: + WARN_ON(1); + ret = -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(freq_qos_remove_notifier); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7880f4f64d0e..80b60ca7767f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include <asm/tlb.h> #include "../workqueue_internal.h" +#include "../../fs/io-wq.h" #include "../smpboot.h" #include "pelt.h" @@ -1065,7 +1066,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) * affecting a valid clamp bucket, the next time it's enqueued, * it will already see the updated clamp bucket value. 
*/ - if (!p->uclamp[clamp_id].active) { + if (p->uclamp[clamp_id].active) { uclamp_rq_dec_id(rq, p, clamp_id); uclamp_rq_inc_id(rq, p, clamp_id); } @@ -1073,6 +1074,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id) task_rq_unlock(rq, p, &rf); } +#ifdef CONFIG_UCLAMP_TASK_GROUP static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css, unsigned int clamps) @@ -1091,7 +1093,6 @@ uclamp_update_active_tasks(struct cgroup_subsys_state *css, css_task_iter_end(&it); } -#ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *css); static void uclamp_update_root_tg(void) { @@ -3929,13 +3930,22 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } restart: +#ifdef CONFIG_SMP /* - * Ensure that we put DL/RT tasks before the pick loop, such that they - * can PULL higher prio tasks when we lower the RQ 'priority'. + * We must do the balancing pass before put_next_task(), such + * that when we release the rq->lock the task is in the same + * state as before we took rq->lock. + * + * We can terminate the balance pass as soon as we know there is + * a runnable task of @class priority or higher. */ - prev->sched_class->put_prev_task(rq, prev, rf); - if (!rq->nr_running) - newidle_balance(rq, rf); + for_class_range(class, prev->sched_class, &idle_sched_class) { + if (class->balance(rq, prev, rf)) + break; + } +#endif + + put_prev_task(rq, prev); for_each_class(class) { p = class->pick_next_task(rq, NULL, NULL); @@ -4103,9 +4113,12 @@ static inline void sched_submit_work(struct task_struct *tsk) * we disable preemption to avoid it calling schedule() again * in the possible wakeup of a kworker. 
*/ - if (tsk->flags & PF_WQ_WORKER) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - wq_worker_sleeping(tsk); + if (tsk->flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else + io_wq_worker_sleeping(tsk); preempt_enable_no_resched(); } @@ -4122,8 +4135,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & PF_WQ_WORKER) - wq_worker_running(tsk); + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); + else + io_wq_worker_running(tsk); + } } asmlinkage __visible void __sched schedule(void) @@ -5106,9 +5123,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) - return -EFAULT; - /* Zero the full structure, so that a short copy will be nice: */ memset(attr, 0, sizeof(*attr)); @@ -5116,45 +5130,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return ret; - /* Bail out on silly large: */ - if (size > PAGE_SIZE) - goto err_size; - /* ABI compatibility quirk: */ if (!size) size = SCHED_ATTR_SIZE_VER0; - - if (size < SCHED_ATTR_SIZE_VER0) + if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) goto err_size; - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. 
- */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); + ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); + if (ret) { + if (ret == -E2BIG) + goto err_size; + return ret; } - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -5354,7 +5342,7 @@ sched_attr_copy_to_user(struct sched_attr __user *uattr, * sys_sched_getattr - similar to sched_getparam, but with sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. - * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. + * @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. 
*/ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, @@ -6039,10 +6027,11 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + __sched_fork(0, idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); - __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; @@ -6230,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next, NULL); + next->sched_class->put_prev_task(rq, next); return next; } } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2305ce89a26c..46ed4e1383e2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -740,7 +740,7 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); /* We might have scheduled out from guest path */ - if (current->flags & PF_VCPU) + if (tsk->flags & PF_VCPU) vtime_account_guest(tsk, vtime); else __vtime_account_system(tsk, vtime); @@ -783,7 +783,7 @@ void vtime_guest_enter(struct task_struct *tsk) */ write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk, vtime); - current->flags |= PF_VCPU; + tsk->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -794,7 +794,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); - current->flags &= ~PF_VCPU; + tsk->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc48720f189..a8a08030a8f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1691,6 +1691,22 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int 
balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1758,45 +1774,28 @@ static struct task_struct * pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; - struct dl_rq *dl_rq; WARN_ON_ONCE(prev || rf); - dl_rq = &rq->dl; - - if (unlikely(!dl_rq->dl_nr_running)) + if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); - p = dl_task_of(dl_se); - set_next_task_dl(rq, p); - return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. 
- */ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - } } /* @@ -2442,6 +2441,7 @@ const struct sched_class dl_sched_class = { .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP + .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, .set_cpus_allowed = set_cpus_allowed_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83ab35e2374f..69a81a5709ff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4926,20 +4926,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. 
+ */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; @@ -6562,6 +6570,15 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } + +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; +} #endif /* CONFIG_SMP */ static unsigned long wakeup_gran(struct sched_entity *se) @@ -6738,7 +6755,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - if (!cfs_rq->nr_running) + if (!sched_fair_runnable(rq)) goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6876,7 +6893,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; @@ -7531,6 +7548,19 @@ static void update_blocked_averages(int cpu) update_rq_clock(rq); /* + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure + * that RT, DL and IRQ signals have been updated before updating CFS. 
+ */ + curr_class = rq->curr->sched_class; + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); + update_irq_load_avg(rq, 0); + + /* Don't need periodic decay once load/util_avg are null */ + if (others_have_blocked(rq)) + done = false; + + /* * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ @@ -7557,14 +7587,6 @@ static void update_blocked_averages(int cpu) done = false; } - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; - update_blocked_load_status(rq, !done); rq_unlock_irqrestore(rq, &rf); } @@ -7625,12 +7647,18 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + /* + * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure + * that RT, DL and IRQ signals have been updated before updating CFS. 
+ */ curr_class = rq->curr->sched_class; update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); + + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); rq_unlock_irqrestore(rq, &rf); } @@ -10406,11 +10434,11 @@ const struct sched_class fair_sched_class = { .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, - .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, #ifdef CONFIG_SMP + .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..f65ef1e2f204 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -365,6 +365,12 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static int +balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return WARN_ON_ONCE(1); +} #endif /* @@ -375,7 +381,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } @@ -460,6 +466,7 @@ const struct sched_class idle_sched_class = { .set_next_task = set_next_task_idle, #ifdef CONFIG_SMP + .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index a39bed2c784f..168479a7d61b 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -174,7 +174,6 @@ static int membarrier_private_expedited(int flags) */ if (cpu == 
raw_smp_processor_id()) continue; - rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ebaa4e619684..9b8adc01be3d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1469,6 +1469,22 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) resched_curr(rq); } +static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +{ + if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet started the picking loop. + */ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); + } + + return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); +} #endif /* CONFIG_SMP */ /* @@ -1552,21 +1568,18 @@ static struct task_struct * pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; - struct rt_rq *rt_rq = &rq->rt; WARN_ON_ONCE(prev || rf); - if (!rt_rq->rt_queued) + if (!sched_rt_runnable(rq)) return NULL; p = _pick_next_task_rt(rq); - set_next_task_rt(rq, p); - return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); @@ -1578,18 +1591,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_fla */ if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet started the picking loop. 
- */ - rq_unpin_lock(rq, rf); - pull_rt_task(rq); - rq_repin_lock(rq, rf); - } } #ifdef CONFIG_SMP @@ -2366,8 +2367,8 @@ const struct sched_class rt_sched_class = { .set_next_task = set_next_task_rt, #ifdef CONFIG_SMP + .balance = balance_rt, .select_task_rq = select_task_rq_rt, - .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0db2c1b3361e..c8870c5bd7df 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1727,10 +1727,11 @@ struct sched_class { struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf); + void (*put_prev_task)(struct rq *rq, struct task_struct *p); void (*set_next_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -1773,7 +1774,7 @@ struct sched_class { static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); - prev->sched_class->put_prev_task(rq, prev, NULL); + prev->sched_class->put_prev_task(rq, prev); } static inline void set_next_task(struct rq *rq, struct task_struct *next) @@ -1787,8 +1788,12 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) #else #define sched_class_highest (&dl_sched_class) #endif + +#define for_class_range(class, _from, _to) \ + for (class = (_from); class != (_to); class = class->next) + #define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) + for_class_range(class, sched_class_highest, NULL) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; @@ -1796,6 +1801,25 @@ extern 
const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; +static inline bool sched_stop_runnable(struct rq *rq) +{ + return rq->stop && task_on_rq_queued(rq->stop); +} + +static inline bool sched_dl_runnable(struct rq *rq) +{ + return rq->dl.dl_nr_running > 0; +} + +static inline bool sched_rt_runnable(struct rq *rq) +{ + return rq->rt.rt_queued > 0; +} + +static inline bool sched_fair_runnable(struct rq *rq) +{ + return rq->cfs.nr_running > 0; +} #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7e1cee4e65b2..c0640739e05e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -15,6 +15,12 @@ select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } + +static int +balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + return sched_stop_runnable(rq); +} #endif /* CONFIG_SMP */ static void @@ -31,16 +37,13 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop) static struct task_struct * pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *stop = rq->stop; - WARN_ON_ONCE(prev || rf); - if (!stop || !task_on_rq_queued(stop)) + if (!sched_stop_runnable(rq)) return NULL; - set_next_task_stop(rq, stop); - - return stop; + set_next_task_stop(rq, rq->stop); + return rq->stop; } static void @@ -60,7 +63,7 @@ static void yield_task_stop(struct rq *rq) BUG(); /* the stop task should never yield, its pointless. 
*/ } -static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -129,6 +132,7 @@ const struct sched_class stop_sched_class = { .set_next_task = set_next_task_stop, #ifdef CONFIG_SMP + .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b5667a273bf6..49b835f1305f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1948,7 +1948,7 @@ next_level: static int build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { - enum s_alloc alloc_state; + enum s_alloc alloc_state = sa_none; struct sched_domain *sd; struct s_data d; struct rq *rq = NULL; @@ -1956,6 +1956,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct sched_domain_topology_level *tl_asym; bool has_asym = false; + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; @@ -2026,7 +2029,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (has_asym) - static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + static_branch_inc_cpuslocked(&sched_asym_cpucapacity); if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", @@ -2121,8 +2124,12 @@ int sched_init_domains(const struct cpumask *cpu_map) */ static void detach_destroy_domains(const struct cpumask *cpu_map) { + unsigned int cpu = cpumask_any(cpu_map); int i; + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) + static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); diff --git 
a/kernel/signal.c b/kernel/signal.c index c4da1ef56fdf..bcd46f547db3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2205,8 +2205,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t */ preempt_disable(); read_unlock(&tasklist_lock); - preempt_enable_no_resched(); cgroup_enter_frozen(); + preempt_enable_no_resched(); freezable_schedule(); cgroup_leave_frozen(true); } else { diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 6d1f68b7e528..c9ea7eb2cb1a 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -141,7 +141,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, struct stacktrace_cookie c = { .store = store, .size = size, - .skip = skipnr + 1, + /* skip this function if they are tracing us */ + .skip = skipnr + !!(current == tsk), }; if (!try_get_task_stack(tsk)) @@ -298,7 +299,8 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, struct stack_trace trace = { .entries = store, .max_entries = size, - .skip = skipnr + 1, + /* skip this function if they are tracing us */ + .skip = skipnr + !!(current == task), }; save_stack_trace_tsk(task, &trace); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c7031a22aa7b..998d50ee2d9b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -7,6 +7,7 @@ * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> */ +#include <linux/compiler.h> #include <linux/completion.h> #include <linux/cpu.h> #include <linux/init.h> @@ -167,7 +168,7 @@ static void set_state(struct multi_stop_data *msdata, /* Reset ack counter. */ atomic_set(&msdata->thread_ack, msdata->num_threads); smp_wmb(); - msdata->state = newstate; + WRITE_ONCE(msdata->state, newstate); } /* Last one to ack a state moves to the next state. 
*/ @@ -186,7 +187,7 @@ void __weak stop_machine_yield(const struct cpumask *cpumask) static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; - enum multi_stop_state curstate = MULTI_STOP_NONE; + enum multi_stop_state newstate, curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; const struct cpumask *cpumask; unsigned long flags; @@ -210,8 +211,9 @@ static int multi_cpu_stop(void *data) do { /* Chill out and ensure we re-read multi_stop_state. */ stop_machine_yield(cpumask); - if (msdata->state != curstate) { - curstate = msdata->state; + newstate = READ_ONCE(msdata->state); + if (newstate != curstate) { + curstate = newstate; switch (curstate) { case MULTI_STOP_DISABLE_IRQ: local_irq_disable(); diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c new file mode 100644 index 000000000000..2a63241a8453 --- /dev/null +++ b/kernel/sysctl-test.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of proc sysctl. + */ + +#include <kunit/test.h> +#include <linux/sysctl.h> + +#define KUNIT_PROC_READ 0 +#define KUNIT_PROC_WRITE 1 + +static int i_zero; +static int i_one_hundred = 100; + +/* + * Test that proc_dointvec will not try to use a NULL .data field even when the + * length is non-zero. + */ +static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test) +{ + struct ctl_table null_data_table = { + .procname = "foo", + /* + * Here we are testing that proc_dointvec behaves correctly when + * we give it a NULL .data field. Normally this would point to a + * piece of memory where the value would be stored. + */ + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + /* + * proc_dointvec expects a buffer in user space, so we allocate one. We + * also need to cast it to __user so sparse doesn't get mad. 
+ */ + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + size_t len; + loff_t pos; + + /* + * We don't care what the starting length is since proc_dointvec should + * not try to read because .data is NULL. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table, + KUNIT_PROC_READ, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + /* + * See above. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&null_data_table, + KUNIT_PROC_WRITE, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Similar to the previous test, we create a struct ctl_table that has a .data + * field that proc_dointvec cannot do anything with; however, this time it is + * because we tell proc_dointvec that the size is 0. + */ +static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test) +{ + int data = 0; + struct ctl_table data_maxlen_unset_table = { + .procname = "foo", + .data = &data, + /* + * So .data is no longer NULL, but we tell proc_dointvec its + * length is 0, so it still shouldn't try to use it. + */ + .maxlen = 0, + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + size_t len; + loff_t pos; + + /* + * As before, we don't care what buffer length is because proc_dointvec + * cannot do anything because its internal .data buffer has zero length. + */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table, + KUNIT_PROC_READ, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + /* + * See previous comment.
+ */ + len = 1234; + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&data_maxlen_unset_table, + KUNIT_PROC_WRITE, buffer, &len, + &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Here we provide a valid struct ctl_table, but we try to read and write from + * it using a buffer of zero length, so it should still fail in a similar way as + * before. + */ +static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + /* + * However, now our read/write buffer has zero length. + */ + size_t len = 0; + loff_t pos; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Test that proc_dointvec refuses to read when the file position is non-zero. + */ +static void sysctl_test_api_dointvec_table_read_but_position_set( + struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), + GFP_USER); + /* + * We don't care about our buffer length because we start off with a + * non-zero file position. + */ + size_t len = 1234; + /* + * proc_dointvec should refuse to read into the buffer since the file + * pos is non-zero. 
+ */ + loff_t pos = 1; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, buffer, + &len, &pos)); + KUNIT_EXPECT_EQ(test, (size_t)0, len); +} + +/* + * Test that we can read a two digit number in a sufficiently sized buffer. + * Nothing fancy. + */ +static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t len = 4; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + /* Store 13 in the data field. */ + *((int *)table.data) = 13; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, (size_t)3, len); + buffer[len] = '\0'; + /* And we read 13 back out. */ + KUNIT_EXPECT_STREQ(test, "13\n", buffer); +} + +/* + * Same as previous test, just now with negative numbers. + */ +static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test) +{ + int data = 0; + /* Good table. */ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t len = 5; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + *((int *)table.data) = -16; + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_READ, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, (size_t)4, len); + buffer[len] = '\0'; + KUNIT_EXPECT_STREQ(test, "-16\n", (char *)buffer); +} + +/* + * Test that a simple positive write works. + */ +static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test) +{ + int data = 0; + /* Good table.
*/ + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + char input[] = "9"; + size_t len = sizeof(input) - 1; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + + memcpy(buffer, input, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos); + KUNIT_EXPECT_EQ(test, 9, *((int *)table.data)); +} + +/* + * Same as previous test, but now with negative numbers. + */ +static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + char input[] = "-9"; + size_t len = sizeof(input) - 1; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + + memcpy(buffer, input, len); + + KUNIT_EXPECT_EQ(test, 0, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, len); + KUNIT_EXPECT_EQ(test, sizeof(input) - 1, (size_t)pos); + KUNIT_EXPECT_EQ(test, -9, *((int *)table.data)); +} + +/* + * Test that writing a value smaller than the minimum possible value is not + * allowed. 
+ */ +static void sysctl_test_api_dointvec_write_single_less_int_min( + struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t max_len = 32, len = max_len; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, max_len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + unsigned long abs_of_less_than_min = (unsigned long)INT_MAX + - (INT_MAX + INT_MIN) + 1; + + /* + * We use this rigmarole to create a string that contains a value one + * less than the minimum accepted value. + */ + KUNIT_ASSERT_LT(test, + (size_t)snprintf(buffer, max_len, "-%lu", + abs_of_less_than_min), + max_len); + + KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_EXPECT_EQ(test, max_len, len); + KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); +} + +/* + * Test that writing a value greater than the maximum possible value is not + * allowed.
+ */ +static void sysctl_test_api_dointvec_write_single_greater_int_max( + struct kunit *test) +{ + int data = 0; + struct ctl_table table = { + .procname = "foo", + .data = &data, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &i_zero, + .extra2 = &i_one_hundred, + }; + size_t max_len = 32, len = max_len; + loff_t pos = 0; + char *buffer = kunit_kzalloc(test, max_len, GFP_USER); + char __user *user_buffer = (char __user *)buffer; + unsigned long greater_than_max = (unsigned long)INT_MAX + 1; + + KUNIT_ASSERT_GT(test, greater_than_max, (unsigned long)INT_MAX); + KUNIT_ASSERT_LT(test, (size_t)snprintf(buffer, max_len, "%lu", + greater_than_max), + max_len); + KUNIT_EXPECT_EQ(test, -EINVAL, proc_dointvec(&table, KUNIT_PROC_WRITE, + user_buffer, &len, &pos)); + KUNIT_ASSERT_EQ(test, max_len, len); + KUNIT_EXPECT_EQ(test, 0, *((int *)table.data)); +} + +static struct kunit_case sysctl_test_cases[] = { + KUNIT_CASE(sysctl_test_api_dointvec_null_tbl_data), + KUNIT_CASE(sysctl_test_api_dointvec_table_maxlen_unset), + KUNIT_CASE(sysctl_test_api_dointvec_table_len_is_zero), + KUNIT_CASE(sysctl_test_api_dointvec_table_read_but_position_set), + KUNIT_CASE(sysctl_test_dointvec_read_happy_single_positive), + KUNIT_CASE(sysctl_test_dointvec_read_happy_single_negative), + KUNIT_CASE(sysctl_test_dointvec_write_happy_single_positive), + KUNIT_CASE(sysctl_test_dointvec_write_happy_single_negative), + KUNIT_CASE(sysctl_test_api_dointvec_write_single_less_int_min), + KUNIT_CASE(sysctl_test_api_dointvec_write_single_greater_int_max), + {} +}; + +static struct kunit_suite sysctl_test_suite = { + .name = "sysctl_test", + .test_cases = sysctl_test_cases, +}; + +kunit_test_suite(sysctl_test_suite); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 00fcea236eba..b6f2f35d0bcf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,7 +163,7 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef __hppa__ 
+#ifdef CONFIG_PARISC extern int pwrsw_enabled; #endif @@ -620,7 +620,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef __hppa__ +#ifdef CONFIG_PARISC { .procname = "soft-power", .data = &pwrsw_enabled, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0d4dc241c0fb..65605530ee34 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -164,7 +164,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, struct hrtimer_clock_base *base; for (;;) { - base = timer->base; + base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -244,7 +244,7 @@ again: return base; /* See the comment in lock_hrtimer_base() */ - timer->base = &migration_base; + WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); @@ -253,10 +253,10 @@ again: raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; - timer->base = base; + WRITE_ONCE(timer->base, base); goto again; } - timer->base = new_base; + WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 65eb796610dc..069ca78fb0bf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -771,7 +771,7 @@ int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, /* fill PPS status fields */ pps_fill_timex(txc); - txc->time.tv_sec = (time_t)ts->tv_sec; + txc->time.tv_sec = ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 92a431981b1c..42d512fcfda2 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -266,7 
+266,7 @@ static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, /** * thread_group_sample_cputime - Sample cputime for a given task * @tsk: Task for which cputime needs to be started - * @iimes: Storage for time samples + * @samples: Storage for time samples * * Called from sys_getitimer() to calculate the expiry time of an active * timer. That means group cputime accounting is already active. Called @@ -1038,12 +1038,12 @@ unlock: * member of @pct->bases[CLK].nextevt. False otherwise */ static inline bool -task_cputimers_expired(const u64 *sample, struct posix_cputimers *pct) +task_cputimers_expired(const u64 *samples, struct posix_cputimers *pct) { int i; for (i = 0; i < CPUCLOCK_MAX; i++) { - if (sample[i] >= pct->bases[i].nextevt) + if (samples[i] >= pct->bases[i].nextevt) return true; } return false; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 142b07619918..dbd69052eaa6 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -17,6 +17,8 @@ #include <linux/seqlock.h> #include <linux/bitops.h> +#include "timekeeping.h" + /** * struct clock_read_data - data required to read from sched_clock() * diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index c1f5bb590b5e..b5a65e212df2 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -42,39 +42,39 @@ static int bc_shutdown(struct clock_event_device *evt) */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { - int bc_moved; /* - * We try to cancel the timer first. If the callback is on - * flight on some other cpu then we let it handle it. If we - * were able to cancel the timer nothing can rearm it as we - * own broadcast_lock. + * This is called either from enter/exit idle code or from the + * broadcast handler. In all cases tick_broadcast_lock is held. 
* - However we can also be called from the event handler of - * ce_broadcast_hrtimer itself when it expires. We cannot - * restart the timer because we are in the callback, but we - * can set the expiry time and let the callback return - * HRTIMER_RESTART. + * hrtimer_cancel() cannot be called here, either from the + * broadcast handler or from the enter/exit idle code. The idle + * code can run into the problem described in bc_shutdown() and the + * broadcast handler cannot wait for itself to complete for obvious + * reasons. * - * Since we are in the idle loop at this point and because - * hrtimer_{start/cancel} functions call into tracing, - * calls to these functions must be bound within RCU_NONIDLE. + * Each caller tries to arm the hrtimer on its own CPU, but if the + * hrtimer callback function is currently running, then + * hrtimer_start() cannot move it and the timer stays on the CPU on + * which it is assigned at the moment. + * + * As this can be called from idle code, the hrtimer_start() + * invocation has to be wrapped with RCU_NONIDLE() as + * hrtimer_start() can call into tracing. */ - RCU_NONIDLE( - { - bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; - if (bc_moved) { - hrtimer_start(&bctimer, expires, - HRTIMER_MODE_ABS_PINNED_HARD); - } - } - ); - - if (bc_moved) { - /* Bind the "device" to the cpu */ - bc->bound_on = smp_processor_id(); - } else if (bc->bound_on == smp_processor_id()) { - hrtimer_set_expires(&bctimer, expires); - } + RCU_NONIDLE( { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* + * The core tick broadcast mode expects bc->bound_on to be set + * correctly to prevent a CPU which has the broadcast hrtimer + * armed from going deep idle. + * + * As tick_broadcast_lock is held, nothing can change the cpu + * base which was just established in hrtimer_start() above. So + * the below access is safe even without holding the hrtimer + * base lock.
+ */ + bc->bound_on = bctimer.base->cpu_base->cpu; + } ); return 0; } @@ -100,10 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (clockevent_state_oneshot(&ce_broadcast_hrtimer)) - if (ce_broadcast_hrtimer.next_event != KTIME_MAX) - return HRTIMER_RESTART; - return HRTIMER_NORESTART; } diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 4bc37ac3bb05..5ee0f7709410 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -110,8 +110,7 @@ void update_vsyscall(struct timekeeper *tk) nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); - if (__arch_use_vsyscall(vdata)) - update_vdso_data(vdata, tk); + update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); @@ -124,10 +123,8 @@ void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); - if (__arch_use_vsyscall(vdata)) { - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; - } + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2d6e93ab0478..475e29498bca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,8 +64,7 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len, - union kernfs_node_id *cgid) + const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -73,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; @@ -100,8 +99,8 @@ record_it: t->pid = pid; t->cpu = cpu; t->pdu_len = len + cgid_len; - if (cgid) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) @@ -122,7 +121,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm), NULL); + sizeof(tsk->comm), 0); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +138,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), 0); local_irq_restore(flags); } @@ -172,9 +171,9 @@ void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, blkcg = NULL; #ifdef CONFIG_BLK_CGROUP trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, - blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); + blkcg ? cgroup_id(blkcg->css.cgroup) : 1); #else - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, 0); #endif local_irq_restore(flags); } @@ -212,7 +211,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data, union kernfs_node_id *cgid) + void *pdu_data, u64 cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -223,7 +222,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; - ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; + ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -294,7 +293,7 @@ record_it: t->pdu_len = pdu_len + cgid_len; if (cgid_len) - memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); if (pdu_len) memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); @@ -751,31 +750,29 @@ void blk_trace_shutdown(struct request_queue *q) } #ifdef CONFIG_BLK_CGROUP -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { struct blk_trace *bt = q->blk_trace; if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) - return NULL; + return 0; if (!bio->bi_blkg) - return NULL; - return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup); + return 0; + return cgroup_id(bio_blkcg(bio)->css.cgroup); } #else -static union kernfs_node_id * -blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - return NULL; + return 0; } #endif -static union kernfs_node_id * +static u64 blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) { if (!rq->bio) - return NULL; + return 0; /* Use the first bio */ return blk_trace_bio_get_cgid(q, rq->bio); } @@ -797,8 +794,7 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what, - union kernfs_node_id *cgid) + unsigned int nr_bytes, u32 what, u64 cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -913,7 +909,7 @@ static void blk_add_trace_getrq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, NULL); + NULL, 0); } } @@ -929,7 +925,7 @@ static void blk_add_trace_sleeprq(void *ignore, if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, NULL); + 0, 0, NULL, 0); } } @@ -938,7 +934,7 @@ static 
void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -955,7 +951,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } } @@ -1172,19 +1168,17 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return (void *)(te_blk_io_trace(ent) + 1) + - (has_cg ? sizeof(union kernfs_node_id) : 0); + return (void *)(te_blk_io_trace(ent) + 1) + (has_cg ? sizeof(u64) : 0); } -static inline const void *cgid_start(const struct trace_entry *ent) +static inline u64 t_cgid(const struct trace_entry *ent) { - return (void *)(te_blk_io_trace(ent) + 1); + return *(u64 *)(te_blk_io_trace(ent) + 1); } static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent)->pdu_len - - (has_cg ? sizeof(union kernfs_node_id) : 0); + return te_blk_io_trace(ent)->pdu_len - (has_cg ? 
sizeof(u64) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1257,7 +1251,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, fill_rwbs(rwbs, t); if (has_cg) { - const union kernfs_node_id *id = cgid_start(iter->ent); + u64 id = t_cgid(iter->ent); if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { char blkcg_name_buf[NAME_MAX + 1] = "<...>"; @@ -1267,11 +1261,25 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", MAJOR(t->device), MINOR(t->device), blkcg_name_buf, act, rwbs); - } else + } else { + /* + * The cgid portion used to be "INO,GEN". Userland + * builds a FILEID_INO32_GEN fid out of them and + * opens the cgroup using open_by_handle_at(2). + * While 32bit ino setups are still the same, 64bit + * ones now use the 64bit ino as the whole ID and + * no longer use generation. + * + * Regardless of the content, always output + * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can + * be mapped back to @id on both 64 and 32bit ino + * setups. See __kernfs_fh_to_dentry(). 
+ */ trace_seq_printf(&iter->seq, - "%3d,%-3d %x,%-x %2s %3s ", + "%3d,%-3d %llx,%-llx %2s %3s ", MAJOR(t->device), MINOR(t->device), - id->ino, id->generation, act, rwbs); + id & U32_MAX, id >> 32, act, rwbs); + } } else trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", MAJOR(t->device), MINOR(t->device), act, rwbs); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 62a50bf399d6..5259d4dea675 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,6 +18,7 @@ #include <linux/clocksource.h> #include <linux/sched/task.h> #include <linux/kallsyms.h> +#include <linux/security.h> #include <linux/seq_file.h> #include <linux/tracefs.h> #include <linux/hardirq.h> @@ -2493,14 +2494,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) } static int -ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) +ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec) { int ret; if (unlikely(ftrace_disabled)) return 0; - ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); + ret = ftrace_init_nop(mod, rec); if (ret) { ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); @@ -2942,7 +2943,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) * to the NOP instructions. */ if (!__is_defined(CC_USING_NOP_MCOUNT) && - !ftrace_code_disable(mod, p)) + !ftrace_nop_initialize(mod, p)) break; update_cnt++; @@ -3486,6 +3487,11 @@ static int ftrace_avail_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; if (unlikely(ftrace_disabled)) return -ENODEV; @@ -3505,6 +3511,15 @@ ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; + /* + * This shows us what functions are currently being + * traced and by what. Not sure if we want lockdown + * to hide such critical information for an admin. 
+ * Although, perhaps it can show information we don't + * want people to see, but if something is tracing + * something, we probably want to know about it. + */ + iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); if (!iter) return -ENOMEM; @@ -3540,21 +3555,22 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, struct ftrace_hash *hash; struct list_head *mod_head; struct trace_array *tr = ops->private; - int ret = 0; + int ret = -ENOMEM; ftrace_ops_init(ops); if (unlikely(ftrace_disabled)) return -ENODEV; + if (tracing_check_open_get_tr(tr)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) - return -ENOMEM; + goto out; - if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { - kfree(iter); - return -ENOMEM; - } + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) + goto out; iter->ops = ops; iter->flags = flag; @@ -3584,13 +3600,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, if (!iter->hash) { trace_parser_put(&iter->parser); - kfree(iter); - ret = -ENOMEM; goto out_unlock; } } else iter->hash = hash; + ret = 0; + if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3602,7 +3618,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, /* Failed */ free_ftrace_hash(iter->hash); trace_parser_put(&iter->parser); - kfree(iter); } } else file->private_data = iter; @@ -3610,6 +3625,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, out_unlock: mutex_unlock(&ops->func_hash->regex_lock); + out: + if (ret) { + kfree(iter); + if (tr) + trace_array_put(tr); + } + return ret; } @@ -3618,6 +3640,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); @@ -3628,6 +3651,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ 
return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -5037,6 +5061,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_unlock(&iter->ops->func_hash->regex_lock); free_ftrace_hash(iter->hash); + if (iter->tr) + trace_array_put(iter->tr); kfree(iter); return 0; @@ -5194,9 +5220,13 @@ static int __ftrace_graph_open(struct inode *inode, struct file *file, struct ftrace_graph_data *fgd) { - int ret = 0; + int ret; struct ftrace_hash *new_hash = NULL; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if (file->f_mode & FMODE_WRITE) { const int size_bits = FTRACE_HASH_DEFAULT_BITS; @@ -6537,8 +6567,9 @@ ftrace_pid_open(struct inode *inode, struct file *file) struct seq_file *m; int ret = 0; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 252f79c435f8..6a0ee9178365 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@ #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> +#include <linux/security.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/irqflags.h> @@ -304,6 +305,23 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } +int tracing_check_open_get_tr(struct trace_array *tr) +{ + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + if (tracing_disabled) + return -ENODEV; + + if (tr && trace_array_get(tr) < 0) + return -ENODEV; + + return 0; +} + int call_filter_check_discard(struct trace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) @@ -4140,8 +4158,11 @@ release: int tracing_open_generic(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = 
tracing_check_open_get_tr(NULL); + if (ret) + return ret; filp->private_data = inode->i_private; return 0; @@ -4156,15 +4177,14 @@ bool tracing_is_disabled(void) * Open and update trace_array ref count. * Must have the current trace_array passed to it. */ -static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +int tracing_open_generic_tr(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; + int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; filp->private_data = inode->i_private; @@ -4233,10 +4253,11 @@ static int tracing_open(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { @@ -4352,12 +4373,15 @@ static int show_traces_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; - if (tracing_disabled) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = seq_open(file, &show_traces_seq_ops); - if (ret) + if (ret) { + trace_array_put(tr); return ret; + } m = file->private_data; m->private = tr; @@ -4365,6 +4389,14 @@ static int show_traces_open(struct inode *inode, struct file *file) return 0; } +static int show_traces_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + return seq_release(inode, file); +} + static ssize_t tracing_write_stub(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) @@ -4395,8 +4427,8 @@ static const struct file_operations tracing_fops = { static const struct file_operations show_traces_fops = { .open = 
show_traces_open, .read = seq_read, - .release = seq_release, .llseek = seq_lseek, + .release = show_traces_release, }; static ssize_t @@ -4697,11 +4729,9 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_trace_options_show, inode->i_private); if (ret < 0) @@ -5038,8 +5068,11 @@ static const struct seq_operations tracing_saved_tgids_seq_ops = { static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_saved_tgids_seq_ops); } @@ -5115,8 +5148,11 @@ static const struct seq_operations tracing_saved_cmdlines_seq_ops = { static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_saved_cmdlines_seq_ops); } @@ -5280,8 +5316,11 @@ static const struct seq_operations tracing_eval_map_seq_ops = { static int tracing_eval_map_open(struct inode *inode, struct file *filp) { - if (tracing_disabled) - return -ENODEV; + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; return seq_open(filp, &tracing_eval_map_seq_ops); } @@ -5804,13 +5843,11 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; - int ret = 0; - - if (tracing_disabled) - return -ENODEV; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; mutex_lock(&trace_types_lock); @@ -5999,6 +6036,7 @@ waitagain: sizeof(struct 
trace_iterator) - offsetof(struct trace_iterator, seq)); cpumask_clear(iter->started); + trace_seq_init(&iter->seq); iter->pos = -1; trace_event_read_lock(); @@ -6547,11 +6585,9 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr)) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_clock_show, inode->i_private); if (ret < 0) @@ -6581,11 +6617,9 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr)) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private); if (ret < 0) @@ -6638,10 +6672,11 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; struct trace_iterator *iter; struct seq_file *m; - int ret = 0; + int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); @@ -6786,6 +6821,7 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp) struct ftrace_buffer_info *info; int ret; + /* The following checks for tracefs lockdown */ ret = tracing_buffers_open(inode, filp); if (ret < 0) return ret; @@ -7105,8 +7141,9 @@ static int tracing_err_log_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret = 0; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; /* If this file was opened for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -7157,11 +7194,9 @@ static int 
tracing_buffers_open(struct inode *inode, struct file *filp) struct ftrace_buffer_info *info; int ret; - if (tracing_disabled) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f801d154ff6a..d685c61085c0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -338,6 +338,7 @@ extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern void trace_array_put(struct trace_array *tr); +extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); @@ -681,6 +682,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +int tracing_open_generic_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 80e0b2aca703..2e9a4746ea85 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -178,14 +178,14 @@ static int benchmark_event_kthread(void *arg) int trace_benchmark_reg(void) { if (!ok_to_run) { - pr_warning("trace benchmark cannot be started via kernel command line\n"); + pr_warn("trace benchmark cannot be started via kernel command line\n"); return -EBUSY; } bm_event_thread = kthread_run(benchmark_event_kthread, NULL, "event_benchmark"); if (IS_ERR(bm_event_thread)) { - pr_warning("trace benchmark failed to create kernel thread\n"); + pr_warn("trace benchmark failed to create kernel thread\n"); return 
PTR_ERR(bm_event_thread); } diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index a41fed46c285..89779eb84a07 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -174,6 +174,10 @@ static int dyn_event_open(struct inode *inode, struct file *file) { int ret; + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(NULL); if (ret < 0) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0892e38ed6fb..a9dfa04ffa44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -272,9 +272,11 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe) goto out; } + mutex_lock(&event_mutex); ret = perf_trace_event_init(tp_event, p_event); if (ret) destroy_local_trace_kprobe(tp_event); + mutex_unlock(&event_mutex); out: kfree(func); return ret; @@ -282,8 +284,10 @@ out: void perf_kprobe_destroy(struct perf_event *p_event) { + mutex_lock(&event_mutex); perf_trace_event_close(p_event); perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); destroy_local_trace_kprobe(p_event->tp_event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b89cdfe20bc1..fba87d10f0c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) fmt #include <linux/workqueue.h> +#include <linux/security.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/tracefs.h> @@ -1294,6 +1295,8 @@ static int trace_format_open(struct inode *inode, struct file *file) struct seq_file *m; int ret; + /* Do we want to hide event format files on tracefs lockdown? 
*/ + ret = seq_open(file, &trace_format_seq_ops); if (ret < 0) return ret; @@ -1440,28 +1443,17 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; - if (tracing_is_disabled()) - return -ENODEV; - - if (trace_array_get(tr) < 0) - return -ENODEV; - /* Make a temporary dir that has no system but points to tr */ dir = kzalloc(sizeof(*dir), GFP_KERNEL); - if (!dir) { - trace_array_put(tr); + if (!dir) return -ENOMEM; - } - dir->tr = tr; - - ret = tracing_open_generic(inode, filp); + ret = tracing_open_generic_tr(inode, filp); if (ret < 0) { - trace_array_put(tr); kfree(dir); return ret; } - + dir->tr = tr; filp->private_data = dir; return 0; @@ -1771,6 +1763,10 @@ ftrace_event_open(struct inode *inode, struct file *file, struct seq_file *m; int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = seq_open(file, seq_ops); if (ret < 0) return ret; @@ -1795,6 +1791,7 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; + /* Checks for tracefs lockdown */ return ftrace_event_open(inode, file, seq_ops); } @@ -1805,8 +1802,9 @@ ftrace_event_set_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) @@ -1825,8 +1823,9 @@ ftrace_event_set_pid_open(struct inode *inode, struct file *file) struct trace_array *tr = inode->i_private; int ret; - if (trace_array_get(tr) < 0) - return -ENODEV; + ret = tracing_check_open_get_tr(tr); + if (ret) + return ret; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9468bd8d44a2..7482a1466ebf 100644 --- a/kernel/trace/trace_events_hist.c +++ 
b/kernel/trace/trace_events_hist.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/kallsyms.h> +#include <linux/security.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/stacktrace.h> @@ -678,6 +679,8 @@ static bool synth_field_signed(char *type) { if (str_has_prefix(type, "u")) return false; + if (strcmp(type, "gfp_t") == 0) + return false; return true; } @@ -1448,6 +1451,10 @@ static int synth_events_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&synth_event_ops); if (ret < 0) @@ -1680,7 +1687,7 @@ static int save_hist_vars(struct hist_trigger_data *hist_data) if (var_data) return 0; - if (trace_array_get(tr) < 0) + if (tracing_check_open_get_tr(tr)) return -ENODEV; var_data = kzalloc(sizeof(*var_data), GFP_KERNEL); @@ -5515,6 +5522,12 @@ static int hist_show(struct seq_file *m, void *v) static int event_hist_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return single_open(file, hist_show, file); } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2a2912cb4533..2cd53ca21b51 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -5,6 +5,7 @@ * Copyright (C) 2013 Tom Zanussi <tom.zanussi@linux.intel.com> */ +#include <linux/security.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> @@ -173,7 +174,11 @@ static const struct seq_operations event_triggers_seq_ops = { static int event_trigger_regex_open(struct inode *inode, struct file *file) { - int ret = 0; + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; mutex_lock(&event_mutex); @@ -292,6 +297,7 @@ event_trigger_write(struct file *filp, const char __user *ubuf, static int 
event_trigger_open(struct inode *inode, struct file *filp) { + /* Checks for tracefs lockdown */ return event_trigger_regex_open(inode, filp); } diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index fa95139445b2..862f4b0139fc 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -150,7 +150,7 @@ void trace_hwlat_callback(bool enter) if (enter) nmi_ts_start = time_get(); else - nmi_total_ts = time_get() - nmi_ts_start; + nmi_total_ts += time_get() - nmi_ts_start; } if (enter) @@ -256,6 +256,8 @@ static int get_sample(void) /* Keep a running maximum ever recorded hardware latency */ if (sample > tr->max_latency) tr->max_latency = sample; + if (outer_sample > tr->max_latency) + tr->max_latency = outer_sample; } out: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 324ffbea3556..1552a95c743b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -7,11 +7,11 @@ */ #define pr_fmt(fmt) "trace_kprobe: " fmt +#include <linux/security.h> #include <linux/module.h> #include <linux/uaccess.h> #include <linux/rculist.h> #include <linux/error-injection.h> -#include <linux/security.h> #include <asm/setup.h> /* for COMMAND_LINE_SIZE */ @@ -936,6 +936,10 @@ static int probes_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&trace_kprobe_ops); if (ret < 0) @@ -988,6 +992,12 @@ static const struct seq_operations profile_seq_op = { static int profile_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &profile_seq_op); } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index c3fd849d4a8f..d4e31e969206 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -6,6 +6,7 @@ * */ 
#include <linux/seq_file.h> +#include <linux/security.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/ftrace.h> @@ -348,6 +349,12 @@ static const struct seq_operations show_format_seq_ops = { static int ftrace_formats_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &show_format_seq_ops); } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index ec9a34a97129..4df9a209f7ca 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -5,6 +5,7 @@ */ #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> +#include <linux/security.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/spinlock.h> @@ -470,6 +471,12 @@ static const struct seq_operations stack_trace_seq_ops = { static int stack_trace_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &stack_trace_seq_ops); } @@ -487,6 +494,7 @@ stack_trace_filter_open(struct inode *inode, struct file *file) { struct ftrace_ops *ops = inode->i_private; + /* Checks for tracefs lockdown */ return ftrace_regex_open(ops, FTRACE_ITER_FILTER, inode, file); } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75bf1bcb4a8a..9ab0a1a7ad5e 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -9,7 +9,7 @@ * */ - +#include <linux/security.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/rbtree.h> @@ -238,6 +238,10 @@ static int tracing_stat_open(struct inode *inode, struct file *file) struct seq_file *m; struct stat_session *session = inode->i_private; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = stat_seq_init(session); if (ret) return ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index dd884341f5c5..352073d36585 
100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_uprobe: " fmt +#include <linux/security.h> #include <linux/ctype.h> #include <linux/module.h> #include <linux/uaccess.h> @@ -769,6 +770,10 @@ static int probes_open(struct inode *inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { ret = dyn_events_release_all(&trace_uprobe_ops); if (ret) @@ -818,6 +823,12 @@ static const struct seq_operations profile_seq_op = { static int profile_open(struct inode *inode, struct file *file) { + int ret; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + return seq_open(file, &profile_seq_op); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc2e09a8ea61..914b845ad4ff 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -248,7 +248,7 @@ struct workqueue_struct { struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ - struct worker *rescuer; /* I: rescue worker */ + struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ @@ -355,6 +355,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); +static void show_pwq(struct pool_workqueue *pwq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -425,7 +426,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * ignored. 
*/ #define for_each_pwq(pwq, wq) \ - list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node, \ + lockdep_is_held(&wq->mutex)) \ if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else @@ -2532,8 +2534,14 @@ repeat: */ if (need_to_create_worker(pool)) { spin_lock(&wq_mayday_lock); - get_pwq(pwq); - list_move_tail(&pwq->mayday_node, &wq->maydays); + /* + * Queue iff we aren't racing destruction + * and somebody else hasn't queued it already. + */ + if (wq->rescuer && list_empty(&pwq->mayday_node)) { + get_pwq(pwq); + list_add_tail(&pwq->mayday_node, &wq->maydays); + } spin_unlock(&wq_mayday_lock); } } @@ -4314,6 +4322,22 @@ err_destroy: } EXPORT_SYMBOL_GPL(alloc_workqueue); +static bool pwq_busy(struct pool_workqueue *pwq) +{ + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + if (pwq->nr_in_flight[i]) + return true; + + if ((pwq != pwq->wq->dfl_pwq) && (pwq->refcnt > 1)) + return true; + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + return true; + + return false; +} + /** * destroy_workqueue - safely terminate a workqueue * @wq: target workqueue @@ -4325,31 +4349,51 @@ void destroy_workqueue(struct workqueue_struct *wq) struct pool_workqueue *pwq; int node; + /* + * Remove it from sysfs first so that sanity check failure doesn't + * lead to sysfs name conflicts. 
+ */ + workqueue_sysfs_unregister(wq); + /* drain it before proceeding with destruction */ drain_workqueue(wq); - /* sanity checks */ - mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) { - int i; + /* kill rescuer, if sanity checks fail, leave it w/o rescuer */ + if (wq->rescuer) { + struct worker *rescuer = wq->rescuer; - for (i = 0; i < WORK_NR_COLORS; i++) { - if (WARN_ON(pwq->nr_in_flight[i])) { - mutex_unlock(&wq->mutex); - show_workqueue_state(); - return; - } - } + /* this prevents new queueing */ + spin_lock_irq(&wq_mayday_lock); + wq->rescuer = NULL; + spin_unlock_irq(&wq_mayday_lock); + + /* rescuer will empty maydays list before exiting */ + kthread_stop(rescuer->task); + kfree(rescuer); + } - if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || - WARN_ON(pwq->nr_active) || - WARN_ON(!list_empty(&pwq->delayed_works))) { + /* + * Sanity checks - grab all the locks so that we wait for all + * in-flight operations which may do put_pwq(). + */ + mutex_lock(&wq_pool_mutex); + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) { + spin_lock_irq(&pwq->pool->lock); + if (WARN_ON(pwq_busy(pwq))) { + pr_warning("%s: %s has the following busy pwq\n", + __func__, wq->name); + show_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); mutex_unlock(&wq->mutex); + mutex_unlock(&wq_pool_mutex); show_workqueue_state(); return; } + spin_unlock_irq(&pwq->pool->lock); } mutex_unlock(&wq->mutex); + mutex_unlock(&wq_pool_mutex); /* * wq list is used to freeze wq, remove from list after @@ -4359,11 +4403,6 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); - workqueue_sysfs_unregister(wq); - - if (wq->rescuer) - kthread_stop(wq->rescuer->task); - if (!(wq->flags & WQ_UNBOUND)) { wq_unregister_lockdep(wq); /* @@ -4638,7 +4677,8 @@ static void show_pwq(struct pool_workqueue *pwq) pr_info(" pwq %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + pr_cont(" active=%d/%d 
refcnt=%d%s\n", + pwq->nr_active, pwq->max_active, pwq->refcnt, !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); hash_for_each(pool->busy_hash, bkt, worker, hentry) { @@ -4657,7 +4697,7 @@ static void show_pwq(struct pool_workqueue *pwq) pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), - worker == pwq->wq->rescuer ? "(RESCUER)" : "", + worker->rescue_wq ? "(RESCUER)" : "", worker->current_func); list_for_each_entry(work, &worker->scheduled, entry) pr_cont_work(false, work); |