Diffstat (limited to 'kernel')
55 files changed, 751 insertions, 253 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..41751834e764 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o @@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index c565fbf66ac8..b92805b317a2 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa struct audit_fsnotify_mark *audit_mark; struct path path; struct dentry *dentry; - struct inode *inode; int ret; if (pathname[0] != '/' || pathname[len-1] == '/') return ERR_PTR(-EINVAL); - dentry = kern_path_locked(pathname, &path); + dentry = kern_path_parent(pathname, &path); if (IS_ERR(dentry)) return ERR_CAST(dentry); /* returning an error */ - inode = path.dentry->d_inode; - inode_unlock(inode); + if (d_really_is_negative(dentry)) { + audit_mark = ERR_PTR(-ENOENT); + goto out; + } audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { @@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); + ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0); if (ret < 0) { audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0ebbbe37a60f..a700e3c8925f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { struct dentry *d; - d = kern_path_locked_negative(watch->path, parent); + d = kern_path_parent(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); @@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) watch->ino = d_backing_inode(d)->i_ino; } - inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 269c04a24664..f6cf8c2af5f7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d..e4568d44e827 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2366,8 +2366,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! 
*/ WARN_ON_ONCE(1); return 0; @@ -2468,8 +2467,9 @@ out: return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2478,15 +2478,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2505,7 +2506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = fp->jit_requested; + bool jit_needed = false; if (fp->bpf_func) goto finalize; @@ -2514,7 +2515,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -3024,7 +3026,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a..c46360b27871 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc..83c4d9943084 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? 
__bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..8af62cb243d9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1274,8 +1274,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; @@ -3664,10 +3667,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2__ign + j, char, err_out); if (c2 == '\0') return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. + */ + if (i + j == len) + break; __get_kernel_nofault(&c1, s1__ign + j, char, err_out); if (c1 == '\0') return -ENOENT; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..f90bdcc0a047 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } @@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 5ab354d55d82..a00561b1d3e5 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -471,7 +471,7 @@ queue: * any MCS node. This is not the most elegant solution, but is * simple enough. 
*/ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { lockevent_inc(lock_no_node); RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); while (!queued_spin_trylock(lock)) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..9fb1f957a093 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8547,6 +8547,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, verifier_bug(env, "Two map pointers in a timer helper"); return -EFAULT; } + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n"); + return -EOPNOTSUPP; + } meta->map_uid = reg->map_uid; meta->map_ptr = map; return 0; @@ -11354,7 +11358,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 0 : -EINVAL; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..a0d5d62f1483 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -59,6 +59,7 @@ #include <linux/sched/cputime.h> #include <linux/sched/deadline.h> #include <linux/psi.h> +#include <linux/nstree.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -126,8 +127,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. 
*/ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -218,11 +242,12 @@ static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_D /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, - .ns.inum = PROC_CGROUP_INIT_INO, + .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -4159,6 +4184,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -5558,7 +5584,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5593,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5727,7 @@ err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5939,7 +5965,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6312,6 +6338,7 @@ int __init cgroup_init(void) WARN_ON(register_filesystem(&cpuset_fs_type)); #endif + ns_tree_add(&init_cgroup_ns); return 0; } @@ -6325,8 +6352,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. 
*/ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..fdbe57578e68 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -5,7 +5,7 @@ #include <linux/slab.h> #include <linux/nsproxy.h> #include <linux/proc_ns.h> - +#include <linux/nstree.h> /* cgroup namespaces */ @@ -21,33 +21,32 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts) static struct cgroup_namespace *alloc_cgroup_ns(void) { - struct cgroup_namespace *new_ns; + struct cgroup_namespace *new_ns __free(kfree) = NULL; int ret; new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); + ret = ns_common_init(new_ns); + if (ret) return ERR_PTR(ret); - } - refcount_set(&new_ns->ns.count, 1); - new_ns->ns.ops = &cgroupns_operations; - return new_ns; + ns_tree_add(new_ns); + return no_free_ptr(new_ns); } void free_cgroup_ns(struct cgroup_namespace *ns) { + ns_tree_remove(ns); put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kfree(ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { @@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -143,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 64caaf997fc0..7c3924614e01 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y -# Enable Kernel Control Flow Integrity (currently Clang only). -CONFIG_CFI_CLANG=y +# Enable Kernel Control Flow Integrity. +CONFIG_CFI=y # CONFIG_CFI_PERMISSIVE is not set # Attack surface reduction: do not autoload TTY line disciplines. 
diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981..dbf6b687dc5c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index e43c6de2bce4..b82399437db0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -39,6 +39,7 @@ enum { dma_debug_sg, dma_debug_coherent, dma_debug_resource, + dma_debug_noncoherent, }; enum map_err_types { @@ -141,6 +142,7 @@ static const char *type2name[] = { [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", }; static const char *dir2name[] = { @@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if (entry->type == dma_debug_coherent && + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " @@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..48757ca13f31 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, @@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device 
*dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 107e4a4d251d..56de28a3b179 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } @@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 408d28b5179d..f62e1d1b2063 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -143,6 +143,20 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } +/** + * arch_irqentry_exit_need_resched - Architecture specific need resched function + * + * Invoked from raw_irqentry_exit_cond_resched() to check if resched is needed. + * Defaults return true. + * + * The main purpose is to permit arch to avoid preemption of a task from an IRQ. + */ +static inline bool arch_irqentry_exit_need_resched(void); + +#ifndef arch_irqentry_exit_need_resched +static inline bool arch_irqentry_exit_need_resched(void) { return true; } +#endif + void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { @@ -150,7 +164,7 @@ void raw_irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (need_resched() && arch_irqentry_exit_need_resched()) preempt_schedule_irq(); } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..6d3034ec418d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -121,7 +121,7 @@ struct xol_area { static void uprobe_warn(struct task_struct *t, const char *msg) { - pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); + pr_warn("uprobe: %s:%d failed to %s\n", t->comm, t->pid, msg); } /* @@ -2160,7 +2160,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. 
*/ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; diff --git a/kernel/fork.c b/kernel/fork.c index c4ada32598bd..f24f4c71d002 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1507,7 +1507,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1545,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1566,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1596,7 +1596,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, return 0; } -static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) +static int copy_sighand(u64 clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -1645,7 +1645,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; @@ -2295,7 +2295,7 @@ __latent_entropy struct task_struct *copy_process( if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) - goto bad_fork_core_free; + goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. 
We assume that another thread will be created diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index c716a66f8692..d818b4d47f1b 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -230,8 +230,9 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - q->key = *key; + struct task_struct *task; + q->key = *key; __futex_unqueue(q); WARN_ON(!q->rt_waiter); @@ -243,10 +244,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, futex_hash_get(hb); q->drop_hb_ref = true; q->lock_ptr = &hb->lock; + task = READ_ONCE(q->task); /* Signal locked state to the waiter */ futex_requeue_pi_complete(q, 1); - wake_up_state(q->task, TASK_NORMAL); + wake_up_state(task, TASK_NORMAL); } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9b09ad3f9914..e7ad99254841 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) else __msi_domain_free_irqs(dev, domain, ctrl); - if (ops->msi_post_free) - ops->msi_post_free(domain, dev); - if (info->flags & MSI_FLAG_FREE_MSI_DESCS) msi_domain_free_descs(dev, ctrl); } diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 39278737bb68..2a1beebf1d37 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST config MODULES_TREE_LOOKUP def_bool y - depends on PERF_EVENTS || TRACING || CFI_CLANG + depends on PERF_EVENTS || TRACING || CFI endif # MODULES diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index d3204c5c74eb..f8e8c126705c 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -14,7 +14,7 @@ * Use a latched RB-tree for __module_address(); this allows us to use * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * This is conditional on PERF_EVENTS || TRACING || CFI because those can * really hit __module_address() hard by doing a lot of stack unwinding; * potentially from NMI context. 
*/ diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 000000000000..c1fb2bad6d72 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ns_common.h> +#include <linux/proc_ns.h> +#include <linux/vfsdebug.h> + +#ifdef CONFIG_DEBUG_VFS +static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) +{ + switch (ns->ns_type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + VFS_WARN_ON_ONCE(ops != &cgroupns_operations); + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + VFS_WARN_ON_ONCE(ops != &ipcns_operations); + break; +#endif + case CLONE_NEWNS: + VFS_WARN_ON_ONCE(ops != &mntns_operations); + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + VFS_WARN_ON_ONCE(ops != &netns_operations); + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + VFS_WARN_ON_ONCE(ops != &pidns_operations); + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + VFS_WARN_ON_ONCE(ops != &timens_operations); + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + VFS_WARN_ON_ONCE(ops != &userns_operations); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + VFS_WARN_ON_ONCE(ops != &utsns_operations); + break; +#endif + } +} +#endif + +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) +{ + refcount_set(&ns->__ns_ref, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + ns->ns_type = ns_type; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); + +#ifdef CONFIG_DEBUG_VFS + ns_debug(ns, ops); +#endif + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); +} + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..19aa64ab08c8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. 
*/ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 000000000000..b24a320a11a6 --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/nstree.h> +#include <linux/proc_ns.h> +#include <linux/vfsdebug.h> + +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + * @type: type of namespaces in this tree + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + +struct ns_tree mnt_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), + .type = CLONE_NEWNS, +}; + +struct ns_tree net_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), + .type = CLONE_NEWNET, +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree uts_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), + .type = CLONE_NEWUTS, +}; + +struct ns_tree user_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), + .type = CLONE_NEWUSER, +}; + +struct ns_tree ipc_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), + .type = CLONE_NEWIPC, +}; + +struct ns_tree pid_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), + .type = CLONE_NEWPID, +}; + +struct ns_tree cgroup_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), + .type = CLONE_NEWCGROUP, +}; + +struct ns_tree time_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), + .type = CLONE_NEWTIME, +}; + +DEFINE_COOKIE(namespace_cookie); + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node); +} + +static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns(a); + struct ns_common *ns_b = node_to_ns(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +{ + struct rb_node *node, *prev; + + 
VFS_WARN_ON_ONCE(!ns->ns_id); + + write_seqlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + + node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->ns_tree_node); + if (!prev) + list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); + else + list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + + write_sequnlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(node); +} + +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +{ + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + + write_seqlock(&ns_tree->ns_tree_lock); + rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); + list_bidir_del_rcu(&ns->ns_list_node); + RB_CLEAR_NODE(&ns->ns_tree_node); + write_sequnlock(&ns_tree->ns_tree_lock); +} +EXPORT_SYMBOL_GPL(__ns_tree_remove); + +static int ns_find(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + + +static struct ns_tree *ns_tree_from_type(int ns_type) +{ + switch (ns_type) { + case CLONE_NEWCGROUP: + return &cgroup_ns_tree; + case CLONE_NEWIPC: + return &ipc_ns_tree; + case CLONE_NEWNS: + return &mnt_ns_tree; + case CLONE_NEWNET: + return &net_ns_tree; + case CLONE_NEWPID: + return &pid_ns_tree; + case CLONE_NEWUSER: + return &user_ns_tree; + case CLONE_NEWUTS: + return &uts_ns_tree; + case CLONE_NEWTIME: + return &time_ns_tree; + } + + return NULL; +} + +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree *ns_tree; + struct rb_node *node; + unsigned int seq; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + + do { + seq = read_seqbegin(&ns_tree->ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + if (node) + break; + } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + + if (!node) + return NULL; + + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); + + return node_to_ns(node); +} + +/** + * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * tree + * @ns: namespace to start from + * @previous: if true find the previous namespace, otherwise the next + * + * Find the next or previous namespace in the same tree as @ns. If + * there is no next/previous namespace, -ENOENT is returned. + */ +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, bool previous) +{ + struct list_head *list; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); + + if (previous) + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + else + list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); + if (list_is_head(list, &ns_tree->ns_list)) + return ERR_PTR(-ENOENT); + + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); + + return list_entry_rcu(list, struct ns_common, ns_list_node); +} + +/** + * ns_tree_gen_id - generate a new namespace id + * @ns: namespace to generate id for + * + * Generates a new namespace id and assigns it to the namespace. 
All + * namespaces types share the same id space and thus can be compared + * directly. IOW, when two ids of two namespace are equal, they are + * identical. + */ +u64 ns_tree_gen_id(struct ns_common *ns) +{ + guard(preempt)(); + ns->ns_id = gen_cookie_next(&namespace_cookie); + return ns->ns_id; +} diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd2..4fffec767a63 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,13 +71,13 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = PROC_PID_INIT_INO, + .ns.inum = ns_init_inum(&init_pid_ns), #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif @@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = { #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif + .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -491,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) struct upid *upid; pid_t nr = 0; - if (pid && ns->level <= pid->level) { + if (pid && ns && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; @@ -514,7 +515,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (ns) + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; @@ -680,7 +682,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head, container_of(head->set, struct pid_namespace, set); int mode = table->mode; - if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || + if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) || uid_eq(current_euid(), make_kuid(pidns->user_ns, 0))) mode = (mode & S_IRWXU) >> 6; else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..650be58d8d18 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@ #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> +#include <linux/nstree.h> #include <uapi/linux/wait.h> #include "pid_sysctl.h" @@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto out_free_idr; - ns->ns.ops = &pidns_operations; ns->pid_max = PID_MAX_LIMIT; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; - refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); @@ -124,10 +123,11 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif + ns_tree_add(ns); return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -149,9 +149,10 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { + ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + 
ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); @@ -168,10 +169,10 @@ static void destroy_pid_namespace_work(struct work_struct *work) parent = ns->parent; destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) @@ -183,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); @@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -390,11 +386,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. 
*/ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); @@ -447,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -458,7 +459,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, @@ -475,6 +475,7 @@ static __init int pid_namespaces_init(void) #endif register_pid_ns_sysctl_table_vm(); + ns_tree_add(&init_pid_ns); return 0; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index ea7995a25780..8df55397414a 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -553,6 +553,30 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts) { + int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_register_pd_no_update() - Register a perf domain for a device + * @dev : Device to register the PD for + * @nr_states : Number of performance states in the new PD + * @cb : Callback functions for populating the energy model + * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device) + * @microwatts : Whether or not the power values in the EM will be in uW + * + * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity + * update after registering the PD, even if @dev is a CPU device. + */ +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; @@ -636,12 +660,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); - if (_is_cpu_device(dev)) - em_check_capacity_update(); - return ret; } -EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f1f30cca573..2f66ab453823 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -449,6 +449,7 @@ int hibernation_snapshot(int platform_mode) shrink_shmem_memory(); console_suspend_all(); + pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..ec33b0353027 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4472,7 +4472,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. 
*/ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4707,7 +4707,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* @@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f25301267e47..72c1f72463c7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_se->dl_defer && !dl_se->dl_defer_running && dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { - if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + if (!is_dl_boosted(dl_se)) { /* * Set dl_se->dl_defer_armed and dl_throttled variables to @@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; -static bool dl_server_stopped(struct sched_dl_entity *dl_se); - static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->dl_runtime) return HRTIMER_NORESTART; - if (!dl_se->server_has_tasks(dl_se)) { - replenish_dl_entity(dl_se); - dl_server_stopped(dl_se); - return HRTIMER_NORESTART; - } - if (dl_se->dl_defer_armed) { /* * First check if the server could consume runtime in background. 
@@ -1579,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) { - dl_se->dl_server_idle = 0; + if (dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); - } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1610,26 +1600,10 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } -static bool dl_server_stopped(struct sched_dl_entity *dl_se) -{ - if (!dl_se->dl_server_active) - return true; - - if (dl_se->dl_server_idle) { - dl_server_stop(dl_se); - return true; - } - - dl_se->dl_server_idle = 1; - return false; -} - void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) { dl_se->rq = rq; - dl_se->server_has_tasks = has_tasks; dl_se->server_pick_task = pick_task; } @@ -2394,10 +2368,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (!dl_server_stopped(dl_se)) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); - } + dl_server_stop(dl_se); goto again; } rq->dl_server = dl_se; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd..088ceff38c8a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 7174e1c1a392..537c6992bb63 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -856,6 +856,32 @@ static bool check_builtin_idle_enabled(void) return false; } +/* + * Determine whether @p is a migration-disabled task in the context of BPF + * code. + * + * We can't simply check whether @p->migration_disabled is set in a + * sched_ext callback, because migration is always disabled for the current + * task while running BPF code. + * + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively + * disable and re-enable migration. For this reason, the current task + * inside a sched_ext callback is always a migration-disabled task. + * + * Therefore, when @p->migration_disabled == 1, check whether @p is the + * current task or not: if it is, then migration was not disabled before + * entering the callback, otherwise migration was disabled. + * + * Returns true if @p is migration-disabled, false otherwise. + */ +static bool is_bpf_migration_disabled(const struct task_struct *p) +{ + if (p->migration_disabled == 1) + return p != current; + else + return p->migration_disabled; +} + static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { @@ -898,7 +924,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. 
*/ - if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { + if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..426287930497 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; @@ -8859,11 +8859,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru return pick_next_task_fair(rq, prev, NULL); } -static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) -{ - return !!dl_se->rq->cfs.nr_queued; -} - static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) { return pick_task_fair(dl_se->rq); @@ -8875,7 +8870,7 @@ void fair_server_init(struct rq *rq) init_dl_entity(dl_se); - dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + dl_server_init(dl_se, rq, fair_server_pick_task); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..69458b9b46dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -365,25 +365,50 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * * dl_se::rq -- runqueue we belong to. * - * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the - * server when it runs out of tasks to run. - * * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this * returns NULL. * * dl_server_update() -- called from update_curr_common(), propagates runtime * to the server. * - * dl_server_start() - * dl_server_stop() -- start/stop the server when it has (no) tasks. + * dl_server_start() -- start the server when it has tasks; it will stop + * automatically when there are no more tasks, per + * dl_se::server_pick() returning NULL. + * + * dl_server_stop() -- (force) stop the server; use when updating + * parameters. * * dl_server_init() -- initializes the server. + * + * When started the dl_server will (per dl_defer) schedule a timer for its + * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a + * server will run at the very end of its period. + * + * This is done such that any runtime from the target class can be accounted + * against the server -- through dl_server_update() above -- such that when it + * becomes time to run, it might already be out of runtime and get deferred + * until the next period. In this case dl_server_timer() will alternate + * between defer and replenish but never actually enqueue the server. + * + * Only when the target class does not manage to exhaust the server's runtime + * (there's actualy starvation in the given period), will the dl_server get on + * the runqueue. Once queued it will pick tasks from the target class and run + * them until either its runtime is exhaused, at which point its back to + * dl_server_timer, or until there are no more tasks to run, at which point + * the dl_server stops itself. + * + * By stopping at this point the dl_server retains bandwidth, which, if a new + * task wakes up imminently (starting the server again), can be used -- + * subject to CBS wakeup rules -- without having to wait for the next period. 
+ * + * Additionally, because of the dl_defer behaviour the start/stop behaviour is + * naturally thottled to once per period, avoiding high context switch + * workloads from spamming the hrtimer program/cancel paths. */ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); @@ -1935,12 +1960,12 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 41aa761c7738..3bbfba30a777 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1139,7 +1139,7 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { - return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; + return match->wait_killable_recv && n->state >= SECCOMP_NOTIFY_SENT; } static int seccomp_do_user_notification(int this_syscall, @@ -1186,13 +1186,11 @@ static int seccomp_do_user_notification(int this_syscall, if (err != 0) { /* - * Check to see if the notifcation got picked up and - * whether we should switch to wait killable. + * Check to see whether we should switch to wait + * killable. Only return the interrupted error if not. */ - if (!wait_killable && should_sleep_killable(match, &n)) - continue; - - goto interrupted; + if (!(!wait_killable && should_sleep_killable(match, &n))) + goto interrupted; } addfd = list_first_entry_or_null(&n.addfd, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..e8c479329282 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -787,10 +787,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -2295,11 +2295,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. 
- */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3..5b6997f4dc3d 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> +#include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> @@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!ns->vvar_page) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -130,7 +129,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) @@ -253,16 +252,13 @@ out: void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); - kfree(ns); -} - -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); + /* Concurrent nstree traversal depends on a grace period. 
*/ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) @@ -466,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -476,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -484,9 +478,15 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns.ns_type = ns_common_type(&init_time_ns), + .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, + .ns.inum = ns_init_inum(&init_time_ns), .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 2a42c1036ea8..484ad7a18463 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -815,6 +815,7 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe unsigned long bitmap; unsigned long ret; int offset; + int bit; int i; ret_stack = ftrace_pop_return_trace(&trace, &ret, frame_pointer, &offset); @@ -829,6 +830,15 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe if (fregs) ftrace_regs_set_instruction_pointer(fregs, ret); + bit = ftrace_test_recursion_trylock(trace.func, ret); + /* + * This can fail because ftrace_test_recursion_trylock() allows one nest + * call. If we are already in a nested call, then we don't probe this and + * just return the original return address. + */ + if (unlikely(bit < 0)) + goto out; + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL trace.retval = ftrace_regs_get_return_value(fregs); #endif @@ -852,6 +862,8 @@ __ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointe } } + ftrace_test_recursion_unlock(bit); +out: /* * The ftrace_graph_return() may still access the current * ret_stack structure, we need to make sure the update of @@ -1397,7 +1409,8 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); - unregister_pm_notifier(&ftrace_suspend_notifier); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index c8034dfc1070..5a807d62e76d 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -428,8 +428,9 @@ static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long ad { unsigned long *addrs; - if (alist->index >= alist->size) - return -ENOMEM; + /* Previously we failed to expand the list. 
*/ + if (alist->index == alist->size) + return -ENOSPC; alist->addrs[alist->index++] = addr; if (alist->index < alist->size) @@ -489,7 +490,7 @@ static int fprobe_module_callback(struct notifier_block *nb, for (i = 0; i < FPROBE_IP_TABLE_SIZE; i++) fprobe_remove_node_in_module(mod, &fprobe_ip_table[i], &alist); - if (alist.index < alist.size && alist.index > 0) + if (alist.index > 0) ftrace_set_filter_ips(&fprobe_graph_ops.ops, alist.addrs, alist.index, 1, 0); mutex_unlock(&fprobe_mutex); diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c index eea447b06907..c1347da69e9d 100644 --- a/kernel/trace/rv/monitors/sleep/sleep.c +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -127,7 +127,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) mon = ltl_get_monitor(current); switch (id) { +#ifdef __NR_clock_nanosleep case __NR_clock_nanosleep: +#endif #ifdef __NR_clock_nanosleep_time64 case __NR_clock_nanosleep_time64: #endif @@ -138,7 +140,9 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id) ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); break; +#ifdef __NR_futex case __NR_futex: +#endif #ifdef __NR_futex_time64 case __NR_futex_time64: #endif diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 1482e91c39f4..48338520376f 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -495,7 +495,7 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor *mon = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); (*pos)++; @@ -805,7 +805,7 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) retval = create_monitor_dir(monitor, parent); if (retval) - return retval; + goto out_unlock; /* keep children close to the parent for easier visualisation */ if (parent) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b7db732c0b1..b3c94fbaf002 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { @@ -7209,7 +7213,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user entry = ring_buffer_event_data(event); entry->ip = ip; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7306,7 +7310,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5d64a18cacac..d06854bd32b3 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -230,6 +230,10 @@ static int dyn_event_open(struct inode 
*inode, struct file *file) { int ret; + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + ret = tracing_check_open_get_tr(NULL); if (ret) return ret; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index af42aaa3d172..2ab283fd3032 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccae62d4fb91..fa60362a3f31 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], return -EINVAL; } buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index fd259da0aa64..dc734867f0fc 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2322,12 +2322,16 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, int running, err; char *buf __free(kfree) = NULL; - buf = kmalloc(count, GFP_KERNEL); + if (count < 1) + return 0; + + buf = kmalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, ubuf, count)) return -EFAULT; + buf[count] = '\0'; if (!zalloc_cpumask_var(&osnoise_cpumask_new, GFP_KERNEL)) return -ENOMEM; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 16b283f9d831..6ea2f6363b90 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -57,12 +57,11 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_sched = tsk->policy; stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); stats->ac_tgid = task_tgid_nr_ns(tsk, pid_ns); + stats->ac_ppid = task_ppid_nr_ns(tsk, pid_ns); rcu_read_lock(); tcred = __task_cred(tsk); stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); - stats->ac_ppid = pid_alive(tsk) ? 
- task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; rcu_read_unlock(); task_cputime(tsk, &utime, &stime); diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b..0163665914c9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,10 +65,11 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), + .ns.ns_type = ns_common_type(&init_user_ns), + .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = PROC_USER_INIT_INO, + .ns.inum = ns_init_inum(&init_user_ns), #ifdef CONFIG_USER_NS .ns.ops = &userns_operations, #endif diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 682f40d5632d..03cb63883d04 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/nstree.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); @@ -124,12 +125,11 @@ int create_user_ns(struct cred *new) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_alloc_inum(&ns->ns); + + ret = ns_common_init(ns); if (ret) goto fail_free; - ns->ns.ops = &userns_operations; - refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; @@ -159,12 +159,13 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); @@ -218,11 +220,12 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); - kmem_cache_free(user_ns_cachep, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. 
*/ + kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) @@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -1402,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, @@ -1413,6 +1410,7 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); + ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init); diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..ebbfc578a9d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -13,6 +13,7 @@ #include <linux/cred.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> +#include <linux/nstree.h> #include <linux/sched/task.h> static struct kmem_cache *uts_ns_cache __ro_after_init; @@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } -static struct uts_namespace *create_uts_ns(void) -{ - struct uts_namespace *uts_ns; - - uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); - if (uts_ns) - refcount_set(&uts_ns->ns.count, 1); - return uts_ns; -} - /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone @@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = create_uts_ns(); + ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL); if (!ns) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free; ns->ucounts = ucounts; - ns->ns.ops = &utsns_operations; - down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); + ns_tree_add(ns); return ns; fail_free: @@ -86,7 +76,7 @@ fail: * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; @@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags, void free_uts_ns(struct uts_namespace *ns) { + ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - kmem_cache_free(uts_ns_cache, ns); -} - -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); + ns_common_free(ns); + /* Concurrent nstree traversal depends on a grace period. 
 */
+	kfree_rcu(ns, ns.ns_rcu);
 }
 
 static struct ns_common *utsns_get(struct task_struct *task)
@@ -159,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns)
 
 const struct proc_ns_operations utsns_operations = {
 	.name		= "uts",
-	.type		= CLONE_NEWUTS,
 	.get		= utsns_get,
 	.put		= utsns_put,
 	.install	= utsns_install,
@@ -174,4 +160,5 @@ void __init uts_ns_init(void)
 			offsetof(struct uts_namespace, name),
 			sizeof_field(struct uts_namespace, name),
 			NULL);
+	ns_tree_add(&init_uts_ns);
 }
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index bc738fa90c1d..27107dcc1cbf 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -100,6 +100,7 @@ void vhost_task_stop(struct vhost_task *vtsk)
 	 * freeing it below.
 	 */
 	wait_for_completion(&vtsk->exited);
+	put_task_struct(vtsk->task);
 	kfree(vtsk);
 }
 EXPORT_SYMBOL_GPL(vhost_task_stop);
@@ -148,7 +149,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
 		return ERR_CAST(tsk);
 	}
 
-	vtsk->task = tsk;
+	vtsk->task = get_task_struct(tsk);
 	return vtsk;
 }
 EXPORT_SYMBOL_GPL(vhost_task_create);
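
The new kernel/sched/sched.h comment above describes the dl_server's deferred ("zero-laxity") behaviour. Below is a minimal userspace sketch of the arithmetic that description implies, with made-up runtime/period values; the real logic lives in dl_server_timer() and the replenish path, not here.

/* Illustration only: with dl_defer, the server is not enqueued at the start
 * of its period; a timer is armed for (deadline - runtime), so the target
 * class gets a chance to burn the budget first via dl_server_update(). */
#include <stdio.h>

int main(void)
{
	long long period_ns  = 1000000000LL;	/* example: 1s period   */
	long long runtime_ns =   50000000LL;	/* example: 50ms budget */
	long long now_ns     = 0;

	long long deadline_ns    = now_ns + period_ns;
	long long zero_laxity_ns = deadline_ns - runtime_ns;

	printf("server deferred until t=%lld ns (deadline at %lld ns)\n",
	       zero_laxity_ns, deadline_ns);
	return 0;
}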
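
The kernel/seccomp.c hunk relaxes should_sleep_killable() from '== SECCOMP_NOTIFY_SENT' to '>= SECCOMP_NOTIFY_SENT'. Here is a small standalone sketch of the ordering assumption that makes the comparison work; the enum below is an illustrative stand-in, not the kernel's definition.

#include <assert.h>

enum notify_state {		/* assumed monotonically increasing ordering */
	NOTIFY_INIT,
	NOTIFY_SENT,
	NOTIFY_REPLIED,
};

static int should_sleep_killable(int wait_killable_recv, enum notify_state state)
{
	/* SENT and every later state now qualify, not just SENT itself. */
	return wait_killable_recv && state >= NOTIFY_SENT;
}

int main(void)
{
	assert(!should_sleep_killable(1, NOTIFY_INIT));
	assert(should_sleep_killable(1, NOTIFY_SENT));
	assert(should_sleep_killable(1, NOTIFY_REPLIED));	/* the case '==' missed */
	return 0;
}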
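
The time, uts and user namespace hunks all converge on one lifecycle: kzalloc() plus ns_common_init() plus ns_tree_add() at creation, and ns_tree_remove() plus ns_common_free() plus kfree_rcu() at teardown, so concurrent nstree traversal always gets a grace period before the memory goes away. A rough kernel-style sketch of that shape follows; "struct foo_namespace" is a placeholder and the snippet is not buildable on its own.

/* Sketch only: foo_namespace and its fields are placeholders; the ns_*
 * helpers are the ones this diff introduces. Error handling is trimmed. */
#include <linux/err.h>
#include <linux/nstree.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>

struct foo_namespace {
	struct ns_common	ns;
	struct user_namespace	*user_ns;
};

static struct foo_namespace *clone_foo_ns(struct user_namespace *user_ns)
{
	struct foo_namespace *new_ns;
	int err;

	/* zeroed allocation, no manual refcount_set() needed */
	new_ns = kzalloc(sizeof(*new_ns), GFP_KERNEL_ACCOUNT);
	if (!new_ns)
		return ERR_PTR(-ENOMEM);

	/* inum, ops and refcount are set up by one common helper */
	err = ns_common_init(new_ns);
	if (err) {
		kfree(new_ns);
		return ERR_PTR(err);
	}

	new_ns->user_ns = get_user_ns(user_ns);
	ns_tree_add(new_ns);	/* make it visible to nstree traversal */
	return new_ns;
}

static void free_foo_ns(struct foo_namespace *foo)
{
	ns_tree_remove(foo);	/* unlink first ... */
	put_user_ns(foo->user_ns);
	ns_common_free(foo);
	/* ... then free only after concurrent nstree walkers are done */
	kfree_rcu(foo, ns.ns_rcu);
}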
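
The kernel/trace/fgraph.c hunk guards the return handler with the generic ftrace recursion protection. The sketch below shows the usual trylock/unlock pattern around a handler body, assuming the <linux/trace_recursion.h> helpers; it is not the fgraph code itself.

#include <linux/trace_recursion.h>

static void my_return_handler(unsigned long ip, unsigned long parent_ip)
{
	int bit;

	bit = ftrace_test_recursion_trylock(ip, parent_ip);
	if (bit < 0)
		return;	/* already nested: skip rather than recurse */

	/* ... call the actual tracing callbacks here ... */

	ftrace_test_recursion_unlock(bit);
}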
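
The kernel/trace/trace_osnoise.c hunk fixes a missing NUL terminator on a buffer copied from userspace before it is parsed as a string. A small userspace analogue of the corrected pattern; names here are illustrative, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Copy 'count' bytes and guarantee NUL termination, mirroring the
 * kmalloc(count + 1) + buf[count] = '\0' shape of the fix above. */
static char *copy_bounded_string(const char *ubuf, size_t count)
{
	char *buf;

	if (count < 1)
		return NULL;

	buf = malloc(count + 1);	/* +1 for the terminator */
	if (!buf)
		return NULL;

	memcpy(buf, ubuf, count);	/* stands in for copy_from_user() */
	buf[count] = '\0';		/* the byte the old code never wrote */
	return buf;
}

int main(void)
{
	char raw[3] = { '1', '-', '3' };	/* deliberately not NUL-terminated */
	char *s = copy_bounded_string(raw, sizeof(raw));

	if (s) {
		printf("parsed cpumask string: '%s'\n", s);
		free(s);
	}
	return 0;
}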