Diffstat (limited to 'kernel'): 99 files changed, 3689 insertions(+), 2623 deletions(-)
diff --git a/kernel/audit.c b/kernel/audit.c index 6a95a6077953..13d0144efaa3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1221,8 +1221,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; struct audit_sig_info *sig_data; - char *ctx = NULL; - u32 len; + struct lsm_context lsmctx; err = audit_netlink_ok(skb, msg_type); if (err) @@ -1472,27 +1471,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, break; } case AUDIT_SIGNAL_INFO: - len = 0; if (lsmprop_is_set(&audit_sig_lsm)) { - err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx, - &len); - if (err) + err = security_lsmprop_to_secctx(&audit_sig_lsm, + &lsmctx); + if (err < 0) return err; } - sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); + sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len), + GFP_KERNEL); if (!sig_data) { if (lsmprop_is_set(&audit_sig_lsm)) - security_release_secctx(ctx, len); + security_release_secctx(&lsmctx); return -ENOMEM; } sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); sig_data->pid = audit_sig_pid; if (lsmprop_is_set(&audit_sig_lsm)) { - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + memcpy(sig_data->ctx, lsmctx.context, lsmctx.len); + security_release_secctx(&lsmctx); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, - sig_data, struct_size(sig_data, ctx, len)); + sig_data, struct_size(sig_data, ctx, + lsmctx.len)); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -2180,23 +2180,22 @@ void audit_log_key(struct audit_buffer *ab, char *key) int audit_log_task_context(struct audit_buffer *ab) { struct lsm_prop prop; - char *ctx = NULL; - unsigned len; + struct lsm_context ctx; int error; security_current_getlsmprop_subj(&prop); if (!lsmprop_is_set(&prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, &len); - if (error) { + error = security_lsmprop_to_secctx(&prop, &ctx); + if (error < 0) { if (error != -EINVAL) goto error_path; return 0; } - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; error_path: diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index bceb9f58a09e..e3f42018ed46 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1319,13 +1319,20 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par if (pathlen < dlen) return 1; - parentlen = parentlen == AUDIT_NAME_FULL ? 
parent_len(path) : parentlen; - if (pathlen - parentlen != dlen) - return 1; + if (parentlen == AUDIT_NAME_FULL) + parentlen = parent_len(path); p = path + parentlen; - return strncmp(p, dname->name, dlen); + /* handle trailing slashes */ + pathlen -= parentlen; + while (p[pathlen - 1] == '/') + pathlen--; + + if (pathlen != dlen) + return 1; + + return memcmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 561d96affe9f..9c853cde9abe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,8 +1098,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - char *ctx = NULL; - u32 len; + struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1110,12 +1109,12 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, &len)) { + if (security_lsmprop_to_secctx(prop, &ctx) < 0) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); } } audit_log_format(ab, " ocomm="); @@ -1393,15 +1392,14 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - char *ctx = NULL; - u32 len; + struct lsm_context lsmctx; if (security_lsmprop_to_secctx(&context->ipc.oprop, - &ctx, &len)) { + &lsmctx) < 0) { *call_panic = 1; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", lsmctx.context); + security_release_secctx(&lsmctx); } } if (context->ipc.has_perm) { @@ -1560,15 +1558,14 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, MAJOR(n->rdev), MINOR(n->rdev)); if (lsmprop_is_set(&n->oprop)) { - char *ctx = NULL; - u32 len; + struct lsm_context ctx; - if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) { + if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { if (call_panic) *call_panic = 2; } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); } } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a2f46785ac3b..774accbd4a22 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -190,7 +190,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, int err; rxq.dev = xdpf->dev_rx; - rxq.mem = xdpf->mem; + rxq.mem.type = xdpf->mem_type; /* TODO: report queue_index to xdp_rxq_info */ xdp_convert_frame_to_buff(xdpf, &xdp); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 3aa002a47a96..482d284a1553 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -678,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { int err; @@ -701,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct 
bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; @@ -720,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index a5c9359d516f..ede31601a363 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o obj-$(CONFIG_CGROUP_MISC) += misc.o +obj-$(CONFIG_CGROUP_DMEM) += dmem.o obj-$(CONFIG_CGROUP_DEBUG) += debug.o diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c new file mode 100644 index 000000000000..fbe34299673d --- /dev/null +++ b/kernel/cgroup/dmem.c @@ -0,0 +1,857 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>) + * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>) + * Partially based on the rdma and misc controllers, which bear the following copyrights: + * + * Copyright 2020 Google LLC + * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> + */ + +#include <linux/cgroup.h> +#include <linux/cgroup_dmem.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/page_counter.h> +#include <linux/parser.h> +#include <linux/slab.h> + +struct dmem_cgroup_region { + /** + * @ref: References keeping the region alive. + * Keeps the region reference alive after a succesful RCU lookup. + */ + struct kref ref; + + /** @rcu: RCU head for freeing */ + struct rcu_head rcu; + + /** + * @region_node: Linked into &dmem_cgroup_regions list. + * Protected by RCU and global spinlock. + */ + struct list_head region_node; + + /** + * @pools: List of pools linked to this region. + * Protected by global spinlock only + */ + struct list_head pools; + + /** @size: Size of region, in bytes */ + u64 size; + + /** @name: Name describing the node, set by dmem_cgroup_register_region */ + char *name; + + /** + * @unregistered: Whether the region is unregistered by its caller. + * No new pools should be added to the region afterwards. + */ + bool unregistered; +}; + +struct dmemcg_state { + struct cgroup_subsys_state css; + + struct list_head pools; +}; + +struct dmem_cgroup_pool_state { + struct dmem_cgroup_region *region; + struct dmemcg_state *cs; + + /* css node, RCU protected against region teardown */ + struct list_head css_node; + + /* dev node, no RCU protection required */ + struct list_head region_node; + + struct rcu_head rcu; + + struct page_counter cnt; + + bool inited; +}; + +/* + * 3 operations require locking protection: + * - Registering and unregistering region to/from list, requires global lock. + * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed. + * - Adding a dmem_cgroup_pool_state to a region list. + * + * Since for the most common operations RCU provides enough protection, I + * do not think more granular locking makes sense. Most protection is offered + * by RCU and the lockless operating page_counter. 
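For reference, the audit.c and auditsc.c hunks earlier in this diff all follow one conversion: the (char *secctx, u32 len) pair becomes a struct lsm_context, only negative returns from security_lsmprop_to_secctx() are treated as errors, and security_release_secctx() now takes the whole context. A minimal sketch of the new consumer pattern, modeled on audit_log_task_context() above (the helper name log_subj_label is illustrative):

#include <linux/audit.h>
#include <linux/security.h>

static void log_subj_label(struct audit_buffer *ab)
{
        struct lsm_prop prop;
        struct lsm_context ctx;

        security_current_getlsmprop_subj(&prop);
        if (!lsmprop_is_set(&prop))
                return;

        /* only negative return values are errors now */
        if (security_lsmprop_to_secctx(&prop, &ctx) < 0)
                return;

        audit_log_format(ab, " subj=%s", ctx.context);
        security_release_secctx(&ctx);
}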
+ */ +static DEFINE_SPINLOCK(dmemcg_lock); +static LIST_HEAD(dmem_cgroup_regions); + +static inline struct dmemcg_state * +css_to_dmemcs(struct cgroup_subsys_state *css) +{ + return container_of(css, struct dmemcg_state, css); +} + +static inline struct dmemcg_state *get_current_dmemcs(void) +{ + return css_to_dmemcs(task_get_css(current, dmem_cgrp_id)); +} + +static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg) +{ + return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL; +} + +static void free_cg_pool(struct dmem_cgroup_pool_state *pool) +{ + list_del(&pool->region_node); + kfree(pool); +} + +static void +set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_min(&pool->cnt, val); +} + +static void +set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_low(&pool->cnt, val); +} + +static void +set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val) +{ + page_counter_set_max(&pool->cnt, val); +} + +static u64 get_resource_low(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.low) : 0; +} + +static u64 get_resource_min(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.min) : 0; +} + +static u64 get_resource_max(struct dmem_cgroup_pool_state *pool) +{ + return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX; +} + +static u64 get_resource_current(struct dmem_cgroup_pool_state *pool) +{ + return pool ? page_counter_read(&pool->cnt) : 0; +} + +static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool) +{ + set_resource_min(rpool, 0); + set_resource_low(rpool, 0); + set_resource_max(rpool, PAGE_COUNTER_MAX); +} + +static void dmemcs_offline(struct cgroup_subsys_state *css) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(css); + struct dmem_cgroup_pool_state *pool; + + rcu_read_lock(); + list_for_each_entry_rcu(pool, &dmemcs->pools, css_node) + reset_all_resource_limits(pool); + rcu_read_unlock(); +} + +static void dmemcs_free(struct cgroup_subsys_state *css) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(css); + struct dmem_cgroup_pool_state *pool, *next; + + spin_lock(&dmemcg_lock); + list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) { + /* + *The pool is dead and all references are 0, + * no need for RCU protection with list_del_rcu or freeing. 
+ */ + list_del(&pool->css_node); + free_cg_pool(pool); + } + spin_unlock(&dmemcg_lock); + + kfree(dmemcs); +} + +static struct cgroup_subsys_state * +dmemcs_alloc(struct cgroup_subsys_state *parent_css) +{ + struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL); + if (!dmemcs) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&dmemcs->pools); + return &dmemcs->css; +} + +static struct dmem_cgroup_pool_state * +find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region) +{ + struct dmem_cgroup_pool_state *pool; + + list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock)) + if (pool->region == region) + return pool; + + return NULL; +} + +static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool) +{ + if (!pool->cnt.parent) + return NULL; + + return container_of(pool->cnt.parent, typeof(*pool), cnt); +} + +static void +dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool) +{ + struct page_counter *climit; + struct cgroup_subsys_state *css, *next_css; + struct dmemcg_state *dmemcg_iter; + struct dmem_cgroup_pool_state *pool, *parent_pool; + bool found_descendant; + + climit = &limit_pool->cnt; + + rcu_read_lock(); + parent_pool = pool = limit_pool; + css = &limit_pool->cs->css; + + /* + * This logic is roughly equivalent to css_foreach_descendant_pre, + * except we also track the parent pool to find out which pool we need + * to calculate protection values for. + * + * We can stop the traversal once we find test_pool among the + * descendants since we don't really care about any others. + */ + while (pool != test_pool) { + next_css = css_next_child(NULL, css); + if (next_css) { + parent_pool = pool; + } else { + while (css != &limit_pool->cs->css) { + next_css = css_next_child(css, css->parent); + if (next_css) + break; + css = css->parent; + parent_pool = pool_parent(parent_pool); + } + /* + * We can only hit this when test_pool is not a + * descendant of limit_pool. + */ + if (WARN_ON_ONCE(css == &limit_pool->cs->css)) + break; + } + css = next_css; + + found_descendant = false; + dmemcg_iter = container_of(css, struct dmemcg_state, css); + + list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) { + if (pool_parent(pool) == parent_pool) { + found_descendant = true; + break; + } + } + if (!found_descendant) + continue; + + page_counter_calculate_protection( + climit, &pool->cnt, true); + } + rcu_read_unlock(); +} + +/** + * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool + * @limit_pool: The pool for which we hit limits + * @test_pool: The pool for which to test + * @ignore_low: Whether we have to respect low watermarks. + * @ret_hit_low: Pointer to whether it makes sense to consider low watermark. + * + * This function returns true if we can evict from @test_pool, false if not. + * When returning false and @ignore_low is false, @ret_hit_low may + * be set to true to indicate this function can be retried with @ignore_low + * set to true. 
+ * + * Return: bool + */ +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low) +{ + struct dmem_cgroup_pool_state *pool = test_pool; + struct page_counter *ctest; + u64 used, min, low; + + /* Can always evict from current pool, despite limits */ + if (limit_pool == test_pool) + return true; + + if (limit_pool) { + if (!parent_dmemcs(limit_pool->cs)) + return true; + + for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool)) + {} + + if (!pool) + return false; + } else { + /* + * If there is no cgroup limiting memory usage, use the root + * cgroup instead for limit calculations. + */ + for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool)) + {} + } + + ctest = &test_pool->cnt; + + dmem_cgroup_calculate_protection(limit_pool, test_pool); + + used = page_counter_read(ctest); + min = READ_ONCE(ctest->emin); + + if (used <= min) + return false; + + if (!ignore_low) { + low = READ_ONCE(ctest->elow); + if (used > low) + return true; + + *ret_hit_low = true; + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable); + +static struct dmem_cgroup_pool_state * +alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, + struct dmem_cgroup_pool_state **allocpool) +{ + struct dmemcg_state *parent = parent_dmemcs(dmemcs); + struct dmem_cgroup_pool_state *pool, *ppool = NULL; + + if (!*allocpool) { + pool = kzalloc(sizeof(*pool), GFP_NOWAIT); + if (!pool) + return ERR_PTR(-ENOMEM); + } else { + pool = *allocpool; + *allocpool = NULL; + } + + pool->region = region; + pool->cs = dmemcs; + + if (parent) + ppool = find_cg_pool_locked(parent, region); + + page_counter_init(&pool->cnt, + ppool ? &ppool->cnt : NULL, true); + reset_all_resource_limits(pool); + + list_add_tail_rcu(&pool->css_node, &dmemcs->pools); + list_add_tail(&pool->region_node, ®ion->pools); + + if (!parent) + pool->inited = true; + else + pool->inited = ppool ? ppool->inited : false; + return pool; +} + +static struct dmem_cgroup_pool_state * +get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region, + struct dmem_cgroup_pool_state **allocpool) +{ + struct dmem_cgroup_pool_state *pool, *ppool, *retpool; + struct dmemcg_state *p, *pp; + + /* + * Recursively create pool, we may not initialize yet on + * recursion, this is done as a separate step. + */ + for (p = dmemcs; p; p = parent_dmemcs(p)) { + pool = find_cg_pool_locked(p, region); + if (!pool) + pool = alloc_pool_single(p, region, allocpool); + + if (IS_ERR(pool)) + return pool; + + if (p == dmemcs && pool->inited) + return pool; + + if (pool->inited) + break; + } + + retpool = pool = find_cg_pool_locked(dmemcs, region); + for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) { + if (pool->inited) + break; + + /* ppool was created if it didn't exist by above loop. */ + ppool = find_cg_pool_locked(pp, region); + + /* Fix up parent links, mark as inited. 
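The kerneldoc for dmem_cgroup_state_evict_valuable() above implies a two-pass protocol on the caller's side: honour the low watermarks first, and retry with ignore_low only once nothing evictable remains above them. A rough sketch of that loop in a driver eviction path (my_try_evict is a placeholder name; the eviction step itself is driver specific and left as a comment):

static void my_try_evict(struct dmem_cgroup_pool_state *limit_pool,
                         struct dmem_cgroup_pool_state *test_pool)
{
        bool ignore_low = false, hit_low = false;

retry:
        if (dmem_cgroup_state_evict_valuable(limit_pool, test_pool,
                                             ignore_low, &hit_low)) {
                /* test_pool is above its protection: evict from it here */
        } else if (!ignore_low && hit_low) {
                /* only "low"-protected memory is left; retry ignoring it */
                ignore_low = true;
                goto retry;
        }
}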
*/ + pool->cnt.parent = &ppool->cnt; + pool->inited = true; + + pool = ppool; + } + + return retpool; +} + +static void dmemcg_free_rcu(struct rcu_head *rcu) +{ + struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu); + struct dmem_cgroup_pool_state *pool, *next; + + list_for_each_entry_safe(pool, next, ®ion->pools, region_node) + free_cg_pool(pool); + kfree(region->name); + kfree(region); +} + +static void dmemcg_free_region(struct kref *ref) +{ + struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref); + + call_rcu(&cgregion->rcu, dmemcg_free_rcu); +} + +/** + * dmem_cgroup_unregister_region() - Unregister a previously registered region. + * @region: The region to unregister. + * + * This function undoes dmem_cgroup_register_region. + */ +void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) +{ + struct list_head *entry; + + if (!region) + return; + + spin_lock(&dmemcg_lock); + + /* Remove from global region list */ + list_del_rcu(®ion->region_node); + + list_for_each_rcu(entry, ®ion->pools) { + struct dmem_cgroup_pool_state *pool = + container_of(entry, typeof(*pool), region_node); + + list_del_rcu(&pool->css_node); + } + + /* + * Ensure any RCU based lookups fail. Additionally, + * no new pools should be added to the dead region + * by get_cg_pool_unlocked. + */ + region->unregistered = true; + spin_unlock(&dmemcg_lock); + + kref_put(®ion->ref, dmemcg_free_region); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region); + +/** + * dmem_cgroup_register_region() - Register a regions for dev cgroup. + * @size: Size of region to register, in bytes. + * @fmt: Region parameters to register + * + * This function registers a node in the dmem cgroup with the + * name given. After calling this function, the region can be + * used for allocations. + * + * Return: NULL or a struct on success, PTR_ERR on failure. + */ +struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...) +{ + struct dmem_cgroup_region *ret; + char *region_name; + va_list ap; + + if (!size) + return NULL; + + va_start(ap, fmt); + region_name = kvasprintf(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!region_name) + return ERR_PTR(-ENOMEM); + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) { + kfree(region_name); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&ret->pools); + ret->name = region_name; + ret->size = size; + kref_init(&ret->ref); + + spin_lock(&dmemcg_lock); + list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions); + spin_unlock(&dmemcg_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_register_region); + +static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name) +{ + struct dmem_cgroup_region *region; + + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock)) + if (!strcmp(name, region->name) && + kref_get_unless_zero(®ion->ref)) + return region; + + return NULL; +} + +/** + * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state + * @pool: &dmem_cgroup_pool_state + * + * Called to drop a reference to the limiting pool returned by + * dmem_cgroup_try_charge(). 
+ */ +void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) +{ + if (pool) + css_put(&pool->cs->css); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put); + +static struct dmem_cgroup_pool_state * +get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region) +{ + struct dmem_cgroup_pool_state *pool, *allocpool = NULL; + + /* fastpath lookup? */ + rcu_read_lock(); + pool = find_cg_pool_locked(cg, region); + if (pool && !READ_ONCE(pool->inited)) + pool = NULL; + rcu_read_unlock(); + + while (!pool) { + spin_lock(&dmemcg_lock); + if (!region->unregistered) + pool = get_cg_pool_locked(cg, region, &allocpool); + else + pool = ERR_PTR(-ENODEV); + spin_unlock(&dmemcg_lock); + + if (pool == ERR_PTR(-ENOMEM)) { + pool = NULL; + if (WARN_ON(allocpool)) + continue; + + allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL); + if (allocpool) { + pool = NULL; + continue; + } + } + } + + kfree(allocpool); + return pool; +} + +/** + * dmem_cgroup_uncharge() - Uncharge a pool. + * @pool: Pool to uncharge. + * @size: Size to uncharge. + * + * Undoes the effects of dmem_cgroup_try_charge. + * Must be called with the returned pool as argument, + * and same @index and @size. + */ +void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) +{ + if (!pool) + return; + + page_counter_uncharge(&pool->cnt, size); + css_put(&pool->cs->css); +} +EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge); + +/** + * dmem_cgroup_try_charge() - Try charging a new allocation to a region. + * @region: dmem region to charge + * @size: Size (in bytes) to charge. + * @ret_pool: On succesfull allocation, the pool that is charged. + * @ret_limit_pool: On a failed allocation, the limiting pool. + * + * This function charges the @region region for a size of @size bytes. + * + * If the function succeeds, @ret_pool is set, which must be passed to + * dmem_cgroup_uncharge() when undoing the allocation. + * + * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it + * will be set to the pool for which the limit is hit. This can be used for + * eviction as argument to dmem_cgroup_evict_valuable(). This reference must be freed + * with @dmem_cgroup_pool_state_put(). + * + * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure. + */ +int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool) +{ + struct dmemcg_state *cg; + struct dmem_cgroup_pool_state *pool; + struct page_counter *fail; + int ret; + + *ret_pool = NULL; + if (ret_limit_pool) + *ret_limit_pool = NULL; + + /* + * hold on to css, as cgroup can be removed but resource + * accounting happens on css. 
+ */ + cg = get_current_dmemcs(); + + pool = get_cg_pool_unlocked(cg, region); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err; + } + + if (!page_counter_try_charge(&pool->cnt, size, &fail)) { + if (ret_limit_pool) { + *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt); + css_get(&(*ret_limit_pool)->cs->css); + } + ret = -EAGAIN; + goto err; + } + + /* On success, reference from get_current_dmemcs is transferred to *ret_pool */ + *ret_pool = pool; + return 0; + +err: + css_put(&cg->css); + return ret; +} +EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge); + +static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v) +{ + struct dmem_cgroup_region *region; + + rcu_read_lock(); + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { + seq_puts(sf, region->name); + seq_printf(sf, " %llu\n", region->size); + } + rcu_read_unlock(); + return 0; +} + +static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region, + u64 *new_limit) +{ + char *end; + + if (!strcmp(options, "max")) { + *new_limit = PAGE_COUNTER_MAX; + return 0; + } + + *new_limit = memparse(options, &end); + if (*end != '\0') + return -EINVAL; + + return 0; +} + +static ssize_t dmemcg_limit_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + void (*apply)(struct dmem_cgroup_pool_state *, u64)) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of)); + int err = 0; + + while (buf && !err) { + struct dmem_cgroup_pool_state *pool = NULL; + char *options, *region_name; + struct dmem_cgroup_region *region; + u64 new_limit; + + options = buf; + buf = strchr(buf, '\n'); + if (buf) + *buf++ = '\0'; + + options = strstrip(options); + + /* eat empty lines */ + if (!options[0]) + continue; + + region_name = strsep(&options, " \t"); + if (!region_name[0]) + continue; + + rcu_read_lock(); + region = dmemcg_get_region_by_name(region_name); + rcu_read_unlock(); + + if (!region) + return -EINVAL; + + err = dmemcg_parse_limit(options, region, &new_limit); + if (err < 0) + goto out_put; + + pool = get_cg_pool_unlocked(dmemcs, region); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + goto out_put; + } + + /* And commit */ + apply(pool, new_limit); + +out_put: + kref_put(®ion->ref, dmemcg_free_region); + } + + + return err ?: nbytes; +} + +static int dmemcg_limit_show(struct seq_file *sf, void *v, + u64 (*fn)(struct dmem_cgroup_pool_state *)) +{ + struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf)); + struct dmem_cgroup_region *region; + + rcu_read_lock(); + list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) { + struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region); + u64 val; + + seq_puts(sf, region->name); + + val = fn(pool); + if (val < PAGE_COUNTER_MAX) + seq_printf(sf, " %lld\n", val); + else + seq_puts(sf, " max\n"); + } + rcu_read_unlock(); + + return 0; +} + +static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_current); +} + +static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_min); +} + +static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min); +} + +static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_low); +} + +static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file 
*of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low); +} + +static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v) +{ + return dmemcg_limit_show(sf, v, get_resource_max); +} + +static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max); +} + +static struct cftype files[] = { + { + .name = "capacity", + .seq_show = dmem_cgroup_region_capacity_show, + .flags = CFTYPE_ONLY_ON_ROOT, + }, + { + .name = "current", + .seq_show = dmem_cgroup_region_current_show, + }, + { + .name = "min", + .write = dmem_cgroup_region_min_write, + .seq_show = dmem_cgroup_region_min_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "low", + .write = dmem_cgroup_region_low_write, + .seq_show = dmem_cgroup_region_low_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "max", + .write = dmem_cgroup_region_max_write, + .seq_show = dmem_cgroup_region_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { } /* Zero entry terminates. */ +}; + +struct cgroup_subsys dmem_cgrp_subsys = { + .css_alloc = dmemcs_alloc, + .css_free = dmemcs_free, + .css_offline = dmemcs_offline, + .legacy_cftypes = files, + .dfl_cftypes = files, +}; diff --git a/kernel/cpu.c b/kernel/cpu.c index b605334f8ee6..0509a9733745 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2179,7 +2179,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { }, [CPUHP_AP_HRTIMERS_DYING] = { .name = "hrtimers:dying", - .startup.single = NULL, + .startup.single = hrtimers_cpu_starting, .teardown.single = hrtimers_cpu_dying, }, [CPUHP_AP_TICK_DYING] = { diff --git a/kernel/cred.c b/kernel/cred.c index da7da250f7c8..9676965c0981 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -477,56 +477,6 @@ void abort_creds(struct cred *new) EXPORT_SYMBOL(abort_creds); /** - * override_creds - Override the current process's subjective credentials - * @new: The credentials to be assigned - * - * Install a set of temporary override subjective credentials on the current - * process, returning the old set for later reversion. - */ -const struct cred *override_creds(const struct cred *new) -{ - const struct cred *old; - - kdebug("override_creds(%p{%ld})", new, - atomic_long_read(&new->usage)); - - /* - * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. - * - * That means that we do not clear the 'non_rcu' flag, since - * we are only installing the cred into the thread-synchronous - * '->cred' pointer, not the '->real_cred' pointer that is - * visible to other threads under RCU. - */ - get_new_cred((struct cred *)new); - old = override_creds_light(new); - - kdebug("override_creds() = %p{%ld}", old, - atomic_long_read(&old->usage)); - return old; -} -EXPORT_SYMBOL(override_creds); - -/** - * revert_creds - Revert a temporary subjective credentials override - * @old: The credentials to be restored - * - * Revert a temporary set of override subjective credentials to an old set, - * discarding the override set. - */ -void revert_creds(const struct cred *old) -{ - const struct cred *override = current->cred; - - kdebug("revert_creds(%p{%ld})", old, - atomic_long_read(&old->usage)); - - revert_creds_light(old); - put_cred(override); -} -EXPORT_SYMBOL(revert_creds); - -/** * cred_fscmp - Compare two credentials with respect to filesystem access. 
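Taken together, the exported dmem API above is meant to be driven from a device driver roughly as follows: register a region once, charge each allocation against the current cgroup, and uncharge on free. A hedged sketch (the region name, the 8 GiB size and the my_* helpers are illustrative, not taken from any in-tree driver):

#include <linux/cgroup_dmem.h>
#include <linux/err.h>

static struct dmem_cgroup_region *my_region;

static int my_region_init(void)
{
        my_region = dmem_cgroup_register_region(8ULL << 30, "my-driver/card0/vram");
        return IS_ERR(my_region) ? PTR_ERR(my_region) : 0;
}

static int my_alloc_charge(u64 size, struct dmem_cgroup_pool_state **pool)
{
        struct dmem_cgroup_pool_state *limit_pool;
        int ret;

        ret = dmem_cgroup_try_charge(my_region, size, pool, &limit_pool);
        if (ret == -EAGAIN) {
                /*
                 * Over a limit: limit_pool identifies the cgroup whose limit
                 * was hit, i.e. where dmem_cgroup_state_evict_valuable()
                 * should be asked about eviction candidates.
                 */
                dmem_cgroup_pool_state_put(limit_pool);
        }
        return ret;
}

static void my_free_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
        dmem_cgroup_uncharge(pool, size);
}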
* @a: The first credential * @b: The second credential diff --git a/kernel/events/core.c b/kernel/events/core.c index 065f9188b44a..bcb09e011e9e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6277,41 +6277,6 @@ unlock: } EXPORT_SYMBOL_GPL(perf_event_update_userpage); -static vm_fault_t perf_mmap_fault(struct vm_fault *vmf) -{ - struct perf_event *event = vmf->vma->vm_file->private_data; - struct perf_buffer *rb; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) - goto unlock; - - vmf->page = perf_mmap_to_page(rb, vmf->pgoff); - if (!vmf->page) - goto unlock; - - get_page(vmf->page); - vmf->page->mapping = vmf->vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *rb) { @@ -6551,13 +6516,87 @@ out_put: ring_buffer_put(rb); /* could be last */ } +static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) +{ + /* The first page is the user control page, others are read-only. */ + return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, + .pfn_mkwrite = perf_mmap_pfn_mkwrite, }; +static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) +{ + unsigned long nr_pages = vma_pages(vma); + int err = 0; + unsigned long pagenum; + + /* + * We map this as a VM_PFNMAP VMA. + * + * This is not ideal as this is designed broadly for mappings of PFNs + * referencing memory-mapped I/O ranges or non-system RAM i.e. for which + * !pfn_valid(pfn). + * + * We are mapping kernel-allocated memory (memory we manage ourselves) + * which would more ideally be mapped using vm_insert_page() or a + * similar mechanism, that is as a VM_MIXEDMAP mapping. + * + * However this won't work here, because: + * + * 1. It uses vma->vm_page_prot, but this field has not been completely + * setup at the point of the f_op->mmp() hook, so we are unable to + * indicate that this should be mapped CoW in order that the + * mkwrite() hook can be invoked to make the first page R/W and the + * rest R/O as desired. + * + * 2. Anything other than a VM_PFNMAP of valid PFNs will result in + * vm_normal_page() returning a struct page * pointer, which means + * vm_ops->page_mkwrite() will be invoked rather than + * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping + * to work around retry logic in the fault handler, however this + * field is no longer allowed to be used within struct page. + * + * 3. Having a struct page * made available in the fault logic also + * means that the page gets put on the rmap and becomes + * inappropriately accessible and subject to map and ref counting. + * + * Ideally we would have a mechanism that could explicitly express our + * desires, but this is not currently the case, so we instead use + * VM_PFNMAP. + * + * We manage the lifetime of these mappings with internal refcounts (see + * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of + * this mapping is maintained correctly. 
+ */ + for (pagenum = 0; pagenum < nr_pages; pagenum++) { + unsigned long va = vma->vm_start + PAGE_SIZE * pagenum; + struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum); + + if (page == NULL) { + err = -EINVAL; + break; + } + + /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */ + err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE, + vm_get_page_prot(vma->vm_flags & ~VM_SHARED)); + if (err) + break; + } + +#ifdef CONFIG_MMU + /* Clear any partial mappings on error. */ + if (err) + zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL); +#endif + + return err; +} + static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_event *event = file->private_data; @@ -6682,6 +6721,8 @@ again: goto again; } + /* We need the rb to map pages. */ + rb = event->rb; goto unlock; } @@ -6776,6 +6817,9 @@ aux_unlock: vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; + if (!ret) + ret = map_range(rb, vma); + if (event->pmu->event_mapped) event->pmu->event_mapped(event, vma->vm_mm); @@ -10039,8 +10083,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; @@ -10425,9 +10468,9 @@ static struct pmu perf_tracepoint = { }; static int perf_tp_filter_match(struct perf_event *event, - struct perf_sample_data *data) + struct perf_raw_record *raw) { - void *record = data->raw->frag.data; + void *record = raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -10439,7 +10482,7 @@ static int perf_tp_filter_match(struct perf_event *event, } static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data, + struct perf_raw_record *raw, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) @@ -10450,7 +10493,7 @@ static int perf_tp_event_match(struct perf_event *event, if (event->attr.exclude_kernel && !user_mode(regs)) return 0; - if (!perf_tp_filter_match(event, data)) + if (!perf_tp_filter_match(event, raw)) return 0; return 1; @@ -10476,6 +10519,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); static void __perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event *event) { struct trace_entry *entry = record; @@ -10485,13 +10529,17 @@ static void __perf_tp_event_target_task(u64 count, void *record, /* Cannot deliver synchronous signal to other task. 
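The map_range()/perf_mmap_pfn_mkwrite() pair above replaces fault-time page insertion: the buffer is pre-mapped read-only as a VM_PFNMAP mapping and only the user control page is upgraded to writable on its first write fault. The same idea in isolation, for a driver that owns its own pages (my_nth_page() and the other my_* names are hypothetical):

#include <linux/fs.h>
#include <linux/mm.h>

static struct page *my_nth_page(void *buf, unsigned long idx);  /* hypothetical lookup */

static vm_fault_t my_pfn_mkwrite(struct vm_fault *vmf)
{
        /* only the first page may become writable */
        return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct my_vm_ops = {
        .pfn_mkwrite = my_pfn_mkwrite,
};

static int my_mmap(struct file *file, struct vm_area_struct *vma)
{
        pgprot_t ro_prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
        unsigned long i, nr = vma_pages(vma);
        int err;

        for (i = 0; i < nr; i++) {
                struct page *page = my_nth_page(file->private_data, i);

                if (!page)
                        return -EINVAL;
                /* mapped read-only now; my_pfn_mkwrite() upgrades page 0 later */
                err = remap_pfn_range(vma, vma->vm_start + i * PAGE_SIZE,
                                      page_to_pfn(page), PAGE_SIZE, ro_prot);
                if (err)
                        return err;
        }

        vma->vm_ops = &my_vm_ops;
        return 0;
}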
*/ if (event->attr.sigtrap) return; - if (perf_tp_event_match(event, data, regs)) + if (perf_tp_event_match(event, raw, regs)) { + perf_sample_data_init(data, 0, 0); + perf_sample_save_raw_data(data, event, raw); perf_swevent_event(event, count, data, regs); + } } static void perf_tp_event_target_task(u64 count, void *record, struct pt_regs *regs, struct perf_sample_data *data, + struct perf_raw_record *raw, struct perf_event_context *ctx) { unsigned int cpu = smp_processor_id(); @@ -10499,15 +10547,15 @@ static void perf_tp_event_target_task(u64 count, void *record, struct perf_event *event, *sibling; perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { - __perf_tp_event_target_task(count, record, regs, data, event); + __perf_tp_event_target_task(count, record, regs, data, raw, event); for_each_sibling_event(sibling, event) - __perf_tp_event_target_task(count, record, regs, data, sibling); + __perf_tp_event_target_task(count, record, regs, data, raw, sibling); } } @@ -10525,15 +10573,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }, }; - perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); - perf_trace_buf_update(record, event_type); hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) { - perf_swevent_event(event, count, &data, regs); - + if (perf_tp_event_match(event, &raw, regs)) { /* * Here use the same on-stack perf_sample_data, * some members in data are event-specific and @@ -10543,7 +10586,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, * because data->sample_flags is set. 
*/ perf_sample_data_init(&data, 0, 0); - perf_sample_save_raw_data(&data, &raw); + perf_sample_save_raw_data(&data, event, &raw); + perf_swevent_event(event, count, &data, regs); } } @@ -10560,7 +10604,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, goto unlock; raw_spin_lock(&ctx->lock); - perf_tp_event_target_task(count, record, regs, &data, ctx); + perf_tp_event_target_task(count, record, regs, &data, &raw, ctx); raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4f46f688d0d4..180509132d4b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx) struct page *page = virt_to_page(rb->aux_pages[idx]); ClearPagePrivate(page); - page->mapping = NULL; __free_page(page); } @@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr) { struct page *page = virt_to_page(addr); - page->mapping = NULL; __free_page(page); } @@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff) return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); } -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - static void rb_free_work(struct work_struct *work) { struct perf_buffer *rb; - void *base; - int i, nr; rb = container_of(work, struct perf_buffer, work); - nr = data_page_nr(rb); - - base = rb->user_page; - /* The '<=' counts in the user page. */ - for (i = 0; i <= nr; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - vfree(base); + vfree(rb->user_page); kfree(rb); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5d71ef85420c..e421a5f2ec7d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1888,9 +1888,33 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) +static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { - struct return_instance *next = ri->next; + ri->cons_cnt = 0; + ri->next = utask->ri_pool; + utask->ri_pool = ri; +} + +static struct return_instance *ri_pool_pop(struct uprobe_task *utask) +{ + struct return_instance *ri = utask->ri_pool; + + if (likely(ri)) + utask->ri_pool = ri->next; + + return ri; +} + +static void ri_free(struct return_instance *ri) +{ + kfree(ri->extra_consumers); + kfree_rcu(ri, rcu); +} + +static void free_ret_instance(struct uprobe_task *utask, + struct return_instance *ri, bool cleanup_hprobe) +{ + unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; @@ -1899,8 +1923,22 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo hprobe_finalize(&ri->hprobe, hstate); } - kfree_rcu(ri, rcu); - return next; + /* + * At this point return_instance is unlinked from utask's + * return_instances list and this has become visible to ri_timer(). + * If seqcount now indicates that ri_timer's return instance + * processing loop isn't active, we can return ri into the pool of + * to-be-reused return instances for future uretprobes. If ri_timer() + * happens to be running right now, though, we fallback to safety and + * just perform RCU-delated freeing of ri. 
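The free_ret_instance()/ri_timer() interplay above boils down to a small reusable pattern: the periodic scanner brackets its RCU walk in a seqcount write section, and the release path recycles an object without waiting for an RCU grace period only when no such walk is in flight. Stripped of the uprobe specifics (my_pool, my_obj and the freelist handling are placeholders):

#include <linux/rcupdate.h>
#include <linux/seqlock.h>
#include <linux/slab.h>

struct my_obj {
        struct rcu_head rcu;
        struct my_obj *next;
};

struct my_pool {
        seqcount_t seq;                 /* seqcount_init() at setup time */
        struct my_obj *freelist;
};

/* Periodic scanner, e.g. a timer callback, walking live objects under RCU. */
static void my_scan(struct my_pool *pool)
{
        write_seqcount_begin(&pool->seq);
        /* ... rcu_read_lock()-protected walk ... */
        write_seqcount_end(&pool->seq);
}

/* Release path: reuse immediately only if no scan is in progress. */
static void my_release(struct my_pool *pool, struct my_obj *obj)
{
        unsigned int seq;

        if (raw_seqcount_try_begin(&pool->seq, seq)) {
                obj->next = pool->freelist;     /* no RCU grace period needed */
                pool->freelist = obj;
        } else {
                kfree_rcu(obj, rcu);            /* the scanner may still see obj */
        }
}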
+ */ + if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { + /* immediate reuse of ri without RCU GP is OK */ + ri_pool_push(utask, ri); + } else { + /* we might be racing with ri_timer(), so play it safe */ + ri_free(ri); + } } /* @@ -1910,7 +1948,7 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri; + struct return_instance *ri, *ri_next; if (!utask) return; @@ -1921,8 +1959,19 @@ void uprobe_free_utask(struct task_struct *t) timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; - while (ri) - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + while (ri) { + ri_next = ri->next; + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; + } + + /* free_ret_instance() above might add to ri_pool, so this loop should come last */ + ri = utask->ri_pool; + while (ri) { + ri_next = ri->next; + ri_free(ri); + ri = ri_next; + } kfree(utask); } @@ -1942,8 +1991,12 @@ static void ri_timer(struct timer_list *timer) /* RCU protects return_instance from freeing. */ guard(rcu)(); + write_seqcount_begin(&utask->ri_seqcount); + for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); + + write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) @@ -1955,6 +2008,7 @@ static struct uprobe_task *alloc_utask(void) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); + seqcount_init(&utask->ri_seqcount); return utask; } @@ -1974,32 +2028,40 @@ static struct uprobe_task *get_utask(void) return current->utask; } -static size_t ri_size(int consumers_cnt) +static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; - return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; -} - -#define DEF_CNT 4 - -static struct return_instance *alloc_return_instance(void) -{ - struct return_instance *ri; + ri = ri_pool_pop(utask); + if (ri) + return ri; - ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); + ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; - ri->consumers_cnt = DEF_CNT; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { - size_t size = ri_size(old->consumers_cnt); + struct return_instance *ri; + + ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); + if (!ri) + return NULL; - return kmemdup(old, size, GFP_KERNEL); + if (unlikely(old->cons_cnt > 1)) { + ri->extra_consumers = kmemdup(old->extra_consumers, + sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), + GFP_KERNEL); + if (!ri->extra_consumers) { + kfree(ri); + return NULL; + } + } + + return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) @@ -2108,14 +2170,17 @@ unsigned long uprobe_get_trampoline_vaddr(void) static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { - struct return_instance *ri = utask->return_instances; + struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? 
RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri, true /* cleanup_hprobe */); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; + + free_ret_instance(utask, ri, true /* cleanup_hprobe */); + ri = ri_next; } - rcu_assign_pointer(utask->return_instances, ri); } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, @@ -2180,7 +2245,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, return; free: - kfree(ri); + ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ @@ -2294,6 +2359,47 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) return is_trap_insn(&opcode); } +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) +{ + struct mm_struct *mm = current->mm; + struct uprobe *uprobe = NULL; + struct vm_area_struct *vma; + struct file *vm_file; + loff_t offset; + unsigned int seq; + + guard(rcu)(); + + if (!mmap_lock_speculate_try_begin(mm, &seq)) + return NULL; + + vma = vma_lookup(mm, bp_vaddr); + if (!vma) + return NULL; + + /* + * vm_file memory can be reused for another instance of struct file, + * but can't be freed from under us, so it's safe to read fields from + * it, even if the values are some garbage values; ultimately + * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure + * that whatever we speculatively found is correct + */ + vm_file = READ_ONCE(vma->vm_file); + if (!vm_file) + return NULL; + + offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); + uprobe = find_uprobe_rcu(vm_file->f_inode, offset); + if (!uprobe) + return NULL; + + /* now double check that nothing about MM changed */ + if (mmap_lock_speculate_retry(mm, seq)) + return NULL; + + return uprobe; +} + /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { @@ -2301,10 +2407,14 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb struct uprobe *uprobe = NULL; struct vm_area_struct *vma; + uprobe = find_active_uprobe_speculative(bp_vaddr); + if (uprobe) + return uprobe; + mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { - if (valid_vma(vma, false)) { + if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); @@ -2324,25 +2434,27 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb return uprobe; } -static struct return_instance* -push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) +static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { + struct return_consumer *ric; + if (unlikely(ri == ZERO_SIZE_PTR)) return ri; - if (unlikely(idx >= ri->consumers_cnt)) { - struct return_instance *old_ri = ri; - - ri->consumers_cnt += DEF_CNT; - ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); - if (!ri) { - kfree(old_ri); + if (unlikely(ri->cons_cnt > 0)) { + ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); + if (!ric) { + ri_free(ri); return ZERO_SIZE_PTR; } + ri->extra_consumers = ric; } - ri->consumers[idx].id = id; - ri->consumers[idx].cookie = cookie; + ric = likely(ri->cons_cnt == 0) ? 
&ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; + ric->id = id; + ric->cookie = cookie; + + ri->cons_cnt++; return ri; } @@ -2350,14 +2462,17 @@ static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; - int idx = *iter; + int idx; - for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { + for (idx = *iter; idx < ri->cons_cnt; idx++) + { + ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } + return NULL; } @@ -2371,9 +2486,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; - int push_idx = 0; + struct uprobe_task *utask = current->utask; - current->utask->auprobe = &uprobe->arch; + utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; @@ -2393,21 +2508,15 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) continue; if (!ri) - ri = alloc_return_instance(); + ri = alloc_return_instance(utask); if (session) - ri = push_consumer(ri, push_idx++, uc->id, cookie); + ri = push_consumer(ri, uc->id, cookie); } - current->utask->auprobe = NULL; + utask->auprobe = NULL; - if (!ZERO_OR_NULL_PTR(ri)) { - /* - * The push_idx value has the final number of return consumers, - * and ri->consumers_cnt has number of allocated consumers. - */ - ri->consumers_cnt = push_idx; + if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); - } if (remove && has_consumers) { down_read(&uprobe->register_rwsem); @@ -2461,7 +2570,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri) void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; - struct return_instance *ri, *next; + struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; @@ -2481,8 +2590,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ - next = find_next_ret_chain(ri); - valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + next_chain = find_next_ret_chain(ri); + valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { @@ -2494,7 +2603,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * trampoline addresses on the stack are replaced with correct * original return addresses */ - rcu_assign_pointer(utask->return_instances, ri->next); + ri_next = ri->next; + rcu_assign_pointer(utask->return_instances, ri_next); + utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) @@ -2502,9 +2613,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs) hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. 
*/ - ri = free_ret_instance(ri, false /* !cleanup_hprobe */); - utask->depth--; - } while (ri != next); + free_ret_instance(utask, ri, false /* !cleanup_hprobe */); + ri = ri_next; + } while (ri != next_chain); } while (!valid); return; diff --git a/kernel/fork.c b/kernel/fork.c index 9b301180fd41..ded49f18cd95 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } @@ -1262,9 +1262,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index d62cca5ed8f4..daea650b16f5 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1020,10 +1020,7 @@ retry_private: * it sees the futex_q::pi_state. */ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); - preempt_disable(); - raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q); if (ret) { if (ret == 1) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 529adb1f5859..5432418c0fea 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,6 +31,10 @@ config GENERIC_IRQ_EFFECTIVE_AFF_MASK config GENERIC_PENDING_IRQ bool +# Deduce delayed migration from top-level interrupt chip flags +config GENERIC_PENDING_IRQ_CHIPFLAGS + bool + # Support for generic irq migrating off cpu before the cpu is offline. config GENERIC_IRQ_MIGRATION bool @@ -141,6 +145,12 @@ config GENERIC_IRQ_DEBUGFS If you don't know what to do here, say N. +# Clear forwarded VM interrupts during kexec. +# This option ensures the kernel clears active states for interrupts +# forwarded to virtual machines (VMs) during a machine kexec. 
+config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD + bool + endmenu config GENERIC_IRQ_MULTI_HANDLER diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index f19d3080bf11..c0f44c06d69d 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o obj-$(CONFIG_IRQ_TIMINGS) += timings.o ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) CFLAGS_timings.o += -DDEBUG diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 271e9139de77..c901436ebd9f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1114,13 +1114,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) trigger = irqd_get_trigger_type(&desc->irq_data); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + IRQD_TRIGGER_MASK | IRQD_LEVEL); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); - if (irq_settings_can_move_pcntxt(desc)) - irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index c6ffb97966be..ca142b9a4db3 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -53,6 +53,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND), BIT_MASK_DESCR(IRQCHIP_IMMUTABLE), + BIT_MASK_DESCR(IRQCHIP_MOVE_DEFERRED), }; static void @@ -108,7 +109,6 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_NO_BALANCING), BIT_MASK_DESCR(IRQD_SINGLE_TARGET), - BIT_MASK_DESCR(IRQD_MOVE_PCNTXT), BIT_MASK_DESCR(IRQD_AFFINITY_SET), BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 32ffcbb87fa1..c4a8bca5f2b0 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -162,6 +162,7 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d) irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } +EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set); /** * irq_gc_eoi - EOI interrupt diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fe0272cd84a5..a979523640d0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -421,7 +421,7 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, #ifdef CONFIG_GENERIC_PENDING_IRQ static inline bool irq_can_move_pcntxt(struct irq_data *data) { - return irqd_can_move_in_process_context(data); + return !(data->chip->flags & IRQCHIP_MOVE_DEFERRED); } static inline bool irq_move_pending(struct irq_data *data) { @@ -441,10 +441,6 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc) { return desc->pending_mask; } -static inline bool handle_enforce_irqctx(struct irq_data *data) -{ - return irqd_is_handle_enforce_irqctx(data); -} bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear); #else /* CONFIG_GENERIC_PENDING_IRQ */ static inline bool irq_can_move_pcntxt(struct irq_data *data) @@ -471,10 +467,6 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) { return false; } -static inline bool handle_enforce_irqctx(struct irq_data *data) 
-{ - return false; -} #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0253e77fcd9a..287830739783 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -708,7 +708,7 @@ int handle_irq_desc(struct irq_desc *desc) return -EINVAL; data = irq_desc_get_irq_data(desc); - if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data))) + if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); diff --git a/kernel/irq/kexec.c b/kernel/irq/kexec.c new file mode 100644 index 000000000000..1a3deffe6b5b --- /dev/null +++ b/kernel/irq/kexec.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdesc.h> +#include <linux/irqnr.h> + +#include "internals.h" + +void machine_kexec_mask_interrupts(void) +{ + struct irq_desc *desc; + unsigned int i; + + for_each_irq_desc(i, desc) { + struct irq_chip *chip; + int check_eoi = 1; + + chip = irq_desc_get_chip(desc); + if (!chip || !irqd_is_started(&desc->irq_data)) + continue; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD)) { + /* + * First try to remove the active state from an interrupt which is forwarded + * to a VM. If the interrupt is not forwarded, try to EOI the interrupt. + */ + check_eoi = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); + } + + if (check_eoi && chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data)) + chip->irq_eoi(&desc->irq_data); + + irq_shutdown(desc); + } +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f0803d6bd296..f300bb6be3bd 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1182,45 +1182,38 @@ out_unlock: } /* - * Interrupts which are not explicitly requested as threaded - * interrupts rely on the implicit bh/preempt disable of the hard irq - * context. So we need to disable bh here to avoid deadlocks and other - * side effects. + * Interrupts explicitly requested as threaded interrupts want to be + * preemptible - many of them need to sleep and wait for slow busses to + * complete. */ -static irqreturn_t -irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret; + irqreturn_t ret = action->thread_fn(action->irq, action->dev_id); - local_bh_disable(); - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_disable(); - ret = action->thread_fn(action->irq, action->dev_id); if (ret == IRQ_HANDLED) atomic_inc(&desc->threads_handled); irq_finalize_oneshot(desc, action); - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_enable(); - local_bh_enable(); return ret; } /* - * Interrupts explicitly requested as threaded interrupts want to be - * preemptible - many of them need to sleep and wait for slow busses to - * complete. + * Interrupts which are not explicitly requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. 
*/ -static irqreturn_t irq_thread_fn(struct irq_desc *desc, - struct irqaction *action) +static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret; - ret = action->thread_fn(action->irq, action->dev_id); - if (ret == IRQ_HANDLED) - atomic_inc(&desc->threads_handled); - - irq_finalize_oneshot(desc, action); + local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); + ret = irq_thread_fn(desc, action); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); + local_bh_enable(); return ret; } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index b07a2d732ffb..1b7fa72968bd 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -53,7 +53,7 @@ static int irq_sw_resend(struct irq_desc *desc) * Validate whether this interrupt can be safely injected from * non interrupt context */ - if (handle_enforce_irqctx(&desc->irq_data)) + if (irqd_is_handle_enforce_irqctx(&desc->irq_data)) return -EINVAL; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 7b7efb1a114b..00b3bd127692 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -11,7 +11,6 @@ enum { _IRQ_NOREQUEST = IRQ_NOREQUEST, _IRQ_NOTHREAD = IRQ_NOTHREAD, _IRQ_NOAUTOEN = IRQ_NOAUTOEN, - _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, @@ -142,11 +141,6 @@ static inline void irq_settings_set_noprobe(struct irq_desc *desc) desc->status_use_accessors |= _IRQ_NOPROBE; } -static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) -{ - return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; -} - static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { return !(desc->status_use_accessors & _IRQ_NOAUTOEN); diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index c43e2ac2f8de..4b7315e99bd6 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -509,6 +509,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) /** * irq_timings_next_event - Return when the next event is supposed to arrive + * @now: current time * * During the last busy cycle, the number of interrupts is incremented * and stored in the irq_timings structure. This information is diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 873f7c445488..cf4af5728307 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -435,13 +435,11 @@ static int __init kallsyms_test_init(void) { struct task_struct *t; - t = kthread_create(test_entry, NULL, "kallsyms_test"); + t = kthread_run_on_cpu(test_entry, NULL, 0, "kallsyms_test"); if (IS_ERR(t)) { pr_info("Create kallsyms selftest task failed\n"); return PTR_ERR(t); } - kthread_bind(t, 0); - wake_up_process(t); return 0; } diff --git a/kernel/kthread.c b/kernel/kthread.c index a5ac612b1609..6a034c76b6e9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; +static LIST_HEAD(kthreads_hotplug); +static DEFINE_MUTEX(kthreads_hotplug_lock); + struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ @@ -53,6 +56,8 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + unsigned int node; + int started; int result; int (*threadfn)(void *); void *data; @@ -63,6 +68,9 @@ struct kthread { #endif /* To store the full name if task comm is truncated. 
*/ char *full_name; + struct task_struct *task; + struct list_head hotplug_node; + struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { @@ -121,8 +129,11 @@ bool set_kthread_struct(struct task_struct *p) init_completion(&kthread->exited); init_completion(&kthread->parked); + INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; + kthread->task = p; + kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } @@ -313,6 +324,16 @@ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; + if (!list_empty(&kthread->hotplug_node)) { + mutex_lock(&kthreads_hotplug_lock); + list_del(&kthread->hotplug_node); + mutex_unlock(&kthreads_hotplug_lock); + + if (kthread->preferred_affinity) { + kfree(kthread->preferred_affinity); + kthread->preferred_affinity = NULL; + } + } do_exit(0); } EXPORT_SYMBOL(kthread_exit); @@ -338,6 +359,56 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) } EXPORT_SYMBOL(kthread_complete_and_exit); +static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) +{ + const struct cpumask *pref; + + if (kthread->preferred_affinity) { + pref = kthread->preferred_affinity; + } else { + if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) + return; + pref = cpumask_of_node(kthread->node); + } + + cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); + if (cpumask_empty(cpumask)) + cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); +} + +static void kthread_affine_node(void) +{ + struct kthread *kthread = to_kthread(current); + cpumask_var_t affinity; + + WARN_ON_ONCE(kthread_is_per_cpu(current)); + + if (kthread->node == NUMA_NO_NODE) { + housekeeping_affine(current, HK_TYPE_KTHREAD); + } else { + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { + WARN_ON_ONCE(1); + return; + } + + mutex_lock(&kthreads_hotplug_lock); + WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); + list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); + /* + * The node cpumask is racy when read from kthread() but: + * - a racing CPU going down will either fail on the subsequent + * call to set_cpus_allowed_ptr() or be migrated to housekeepers + * afterwards by the scheduler. + * - a racing CPU going up will be handled by kthreads_online_cpu() + */ + kthread_fetch_affinity(kthread, affinity); + set_cpus_allowed_ptr(current, affinity); + mutex_unlock(&kthreads_hotplug_lock); + + free_cpumask_var(affinity); + } +} + static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; @@ -368,7 +439,6 @@ static int kthread(void *_create) * back to default in case they have been changed. 
*/ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); - set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); @@ -382,6 +452,11 @@ static int kthread(void *_create) schedule_preempt_disabled(); preempt_enable(); + self->started = 1; + + if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) + kthread_affine_node(); + ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); @@ -540,7 +615,9 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { + struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); + WARN_ON_ONCE(kthread->started); } /** @@ -554,7 +631,9 @@ void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { + struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); + WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); @@ -738,10 +817,11 @@ EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { + static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ - set_task_comm(tsk, "kthreadd"); + set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); @@ -774,6 +854,92 @@ int kthreadd(void *unused) return 0; } +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) +{ + struct kthread *kthread = to_kthread(p); + cpumask_var_t affinity; + unsigned long flags; + int ret; + + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { + WARN_ON(1); + return -EINVAL; + } + + WARN_ON_ONCE(kthread->preferred_affinity); + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return -ENOMEM; + + kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); + if (!kthread->preferred_affinity) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&kthreads_hotplug_lock); + cpumask_copy(kthread->preferred_affinity, mask); + WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); + list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); + kthread_fetch_affinity(kthread, affinity); + + /* It's safe because the task is inactive. */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, affinity); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + mutex_unlock(&kthreads_hotplug_lock); +out: + free_cpumask_var(affinity); + + return 0; +} + +/* + * Re-affine kthreads according to their preferences + * and the newly online CPU. The CPU down part is handled + * by select_fallback_rq() which default re-affines to + * housekeepers from other nodes in case the preferred + * affinity doesn't apply anymore. 
+ */ +static int kthreads_online_cpu(unsigned int cpu) +{ + cpumask_var_t affinity; + struct kthread *k; + int ret; + + guard(mutex)(&kthreads_hotplug_lock); + + if (list_empty(&kthreads_hotplug)) + return 0; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return -ENOMEM; + + ret = 0; + + list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { + if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || + kthread_is_per_cpu(k->task))) { + ret = -EINVAL; + continue; + } + kthread_fetch_affinity(k, affinity); + set_cpus_allowed_ptr(k->task, affinity); + } + + free_cpumask_var(affinity); + + return ret; +} + +static int kthreads_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", + kthreads_online_cpu, NULL); +} +early_initcall(kthreads_init); + void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) @@ -865,12 +1031,11 @@ repeat: EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * -__kthread_create_worker(int cpu, unsigned int flags, - const char namefmt[], va_list args) +__kthread_create_worker_on_node(unsigned int flags, int node, + const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; - int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -878,20 +1043,14 @@ __kthread_create_worker(int cpu, unsigned int flags, kthread_init_worker(worker); - if (cpu >= 0) - node = cpu_to_node(cpu); - task = __kthread_create_on_node(kthread_worker_fn, worker, - node, namefmt, args); + node, namefmt, args); if (IS_ERR(task)) goto fail_task; - if (cpu >= 0) - kthread_bind(task, cpu); - worker->flags = flags; worker->task = task; - wake_up_process(task); + return worker; fail_task: @@ -900,8 +1059,9 @@ fail_task: } /** - * kthread_create_worker - create a kthread worker + * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker + * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) @@ -909,25 +1069,26 @@ fail_task: * when the caller was killed by a fatal signal. */ struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...) +kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); - worker = __kthread_create_worker(-1, flags, namefmt, args); + worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } -EXPORT_SYMBOL(kthread_create_worker); +EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker - * @namefmt: printf-style name for the kthread worker (task). + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. @@ -959,14 +1120,13 @@ EXPORT_SYMBOL(kthread_create_worker); */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...) 
+ const char namefmt[]) { struct kthread_worker *worker; - va_list args; - va_start(args, namefmt); - worker = __kthread_create_worker(cpu, flags, namefmt, args); - va_end(args); + worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); + if (!IS_ERR(worker)) + kthread_bind(worker->task, cpu); return worker; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3c21c31796db..0cd39954d5a1 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -347,6 +347,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * /sys/kernel/livepatch/<patch>/transition * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/replace + * /sys/kernel/livepatch/<patch>/stack_order * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/patched * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> @@ -452,15 +453,38 @@ static ssize_t replace_show(struct kobject *kobj, return sysfs_emit(buf, "%d\n", patch->replace); } +static ssize_t stack_order_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch, *this_patch; + int stack_order = 0; + + this_patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + klp_for_each_patch(patch) { + stack_order++; + if (patch == this_patch) + break; + } + + mutex_unlock(&klp_mutex); + + return sysfs_emit(buf, "%d\n", stack_order); +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct kobj_attribute replace_kobj_attr = __ATTR_RO(replace); +static struct kobj_attribute stack_order_kobj_attr = __ATTR_RO(stack_order); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, &force_kobj_attr.attr, &replace_kobj_attr.attr, + &stack_order_kobj_attr.attr, NULL }; ATTRIBUTE_GROUPS(klp_patch); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2d8ec0351ef9..29acd238dad7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -157,10 +157,12 @@ static inline void lockdep_unlock(void) __this_cpu_dec(lockdep_recursion); } +#ifdef CONFIG_PROVE_LOCKING static inline bool lockdep_assert_locked(void) { return DEBUG_LOCKS_WARN_ON(__owner != current); } +#endif static struct task_struct *lockdep_selftest_task_struct; @@ -430,7 +432,7 @@ static inline u16 hlock_id(struct held_lock *hlock) return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS)); } -static inline unsigned int chain_hlock_class_idx(u16 hlock_id) +static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id) { return hlock_id & (MAX_LOCKDEP_KEYS - 1); } diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index bbe9000260d0..20f9ef58d3d0 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -119,7 +119,8 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) +#define AVG_LOCKDEP_CHAIN_DEPTH 5 +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH) extern struct lock_chain lock_chains[]; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index de95ec07e477..cc33470f4de9 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ 
-106,7 +106,7 @@ static const struct kernel_param_ops lt_bind_ops = { module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644); module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644); -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); static struct task_struct *stats_task; static struct task_struct **writer_tasks; @@ -1358,7 +1358,7 @@ static int __init lock_torture_init(void) if (torture_init_error(firsterr)) goto unwind; if (cpumask_nonempty(bind_writers)) - torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers); + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1369,7 +1369,7 @@ static int __init lock_torture_init(void) if (torture_init_error(firsterr)) goto unwind; if (cpumask_nonempty(bind_readers)) - torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers); + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3302e52f0c96..b36f23de48f1 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -657,10 +657,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err; } - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - /* Make sure we do wakeups before calling schedule */ - wake_up_q(&wake_q); - wake_q_init(&wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); schedule_preempt_disabled(); @@ -710,8 +707,7 @@ skip_wait: if (ww_ctx) ww_mutex_lock_acquired(ww, ww_ctx); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); preempt_enable(); return 0; @@ -720,10 +716,9 @@ err: __mutex_remove_waiter(lock, &waiter); err_early_kill: trace_contention_end(lock, ret); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, ip); - wake_up_q(&wake_q); preempt_enable(); return ret; } @@ -935,10 +930,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne if (owner & MUTEX_FLAG_HANDOFF) __mutex_handoff(lock, next); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } #ifndef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 697a56d3d949..4a8df1800cbb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1292,13 +1292,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ get_task_struct(owner); - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */ - wake_up_q(wake_q); - wake_q_init(wake_q); - preempt_enable(); - + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1642,13 +1636,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - if (wake_q) 
{ - wake_up_q(wake_q); - wake_q_init(wake_q); - } - preempt_enable(); + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) rt_mutex_schedule(); @@ -1799,10 +1787,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, */ raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); rt_mutex_post_schedule(); return ret; @@ -1860,11 +1845,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; - preempt_disable(); - raw_spin_unlock_irq(&lock->wait_lock); - wake_up_q(wake_q); - wake_q_init(wake_q); - preempt_enable(); + raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q); if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) schedule_rtlock(); @@ -1893,10 +1874,7 @@ static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) raw_spin_lock_irqsave(&lock->wait_lock, flags); rtlock_slowlock_locked(lock, &wake_q); - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); - wake_up_q(&wake_q); - preempt_enable(); + raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); } #endif /* RT_MUTEX_BUILD_SPINLOCKS */ diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 5d58b2c0ef98..bcb1b9fea588 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -404,7 +404,7 @@ static inline u32 prandom_u32_below(u32 ceil) static int *get_random_order(int count) { int *order; - int n, r, tmp; + int n, r; order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); if (!order) @@ -415,11 +415,8 @@ static int *get_random_order(int count) for (n = count - 1; n > 1; n--) { r = prandom_u32_below(n + 1); - if (r != n) { - tmp = order[n]; - order[n] = order[r]; - order[r] = tmp; - } + if (r != n) + swap(order[n], order[r]); } return order; diff --git a/kernel/pid.c b/kernel/pid.c index 115448e89c3e..3a10a7b6fcf8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -43,6 +43,7 @@ #include <linux/sched/task.h> #include <linux/idr.h> #include <linux/pidfs.h> +#include <linux/seqlock.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> @@ -60,15 +61,8 @@ struct pid init_struct_pid = { }, } }; -int pid_max = PID_MAX_DEFAULT; - -int pid_max_min = RESERVED_PIDS + 1; -int pid_max_max = PID_MAX_LIMIT; -/* - * Pseudo filesystems start inode numbering after one. We use Reserved - * PIDs as a natural offset. 
- */ -static u64 pidfs_ino = RESERVED_PIDS; +static int pid_max_min = RESERVED_PIDS + 1; +static int pid_max_max = PID_MAX_LIMIT; /* * PID-map pages start out as NULL, they get allocated upon @@ -87,6 +81,7 @@ struct pid_namespace init_pid_ns = { #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif + .pid_max = PID_MAX_DEFAULT, #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif @@ -108,6 +103,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns); */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); +seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); void put_pid(struct pid *pid) { @@ -158,6 +154,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } + pidfs_remove_pid(pid); spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); @@ -193,6 +190,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, for (i = ns->level; i >= 0; i--) { int tid = 0; + int pid_max = READ_ONCE(tmp->pid_max); if (set_tid_size) { tid = set_tid[ns->level - i]; @@ -273,22 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; + idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; - pid->stashed = NULL; - pid->ino = ++pidfs_ino; + pidfs_add_pid(pid); for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); + idr_preload_end(); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); + idr_preload_end(); put_pid_ns(ns); out_free: @@ -644,17 +644,118 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) return fd; } +#ifdef CONFIG_SYSCTL +static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root) +{ + return &task_active_pid_ns(current)->set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return &task_active_pid_ns(current)->set == set; +} + +static int pid_table_root_permissions(struct ctl_table_header *head, + const struct ctl_table *table) +{ + struct pid_namespace *pidns = + container_of(head->set, struct pid_namespace, set); + int mode = table->mode; + + if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || + uid_eq(current_euid(), make_kuid(pidns->user_ns, 0))) + mode = (mode & S_IRWXU) >> 6; + else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) + mode = (mode & S_IRWXG) >> 3; + else + mode = mode & S_IROTH; + return (mode << 6) | (mode << 3) | mode; +} + +static void pid_table_root_set_ownership(struct ctl_table_header *head, + kuid_t *uid, kgid_t *gid) +{ + struct pid_namespace *pidns = + container_of(head->set, struct pid_namespace, set); + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ns_root_uid = make_kuid(pidns->user_ns, 0); + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + ns_root_gid = make_kgid(pidns->user_ns, 0); + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; +} + +static struct ctl_table_root pid_table_root = { + .lookup = pid_table_root_lookup, + .permissions = pid_table_root_permissions, + .set_ownership = pid_table_root_set_ownership, +}; + +static struct ctl_table pid_table[] = { + { + .procname = "pid_max", + .data = &init_pid_ns.pid_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, +}; +#endif + +int register_pidns_sysctls(struct 
pid_namespace *pidns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen); + + tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL); + if (!tbl) + return -ENOMEM; + tbl->data = &pidns->pid_max; + pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + + pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl, + ARRAY_SIZE(pid_table)); + if (!pidns->sysctls) { + kfree(tbl); + retire_sysctl_set(&pidns->set); + return -ENOMEM; + } +#endif + return 0; +} + +void unregister_pidns_sysctls(struct pid_namespace *pidns) +{ +#ifdef CONFIG_SYSCTL + const struct ctl_table *tbl; + + tbl = pidns->sysctls->ctl_table_arg; + unregister_sysctl_table(pidns->sysctls); + retire_sysctl_set(&pidns->set); + kfree(tbl); +#endif +} + void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ - pid_max = min(pid_max_max, max_t(int, pid_max, - PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); - pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); + pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min); idr_init(&init_pid_ns.idr); @@ -665,6 +766,16 @@ void __init pid_idr_init(void) NULL); } +static __init int pid_namespace_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + /* "kernel" directory will have already been initialized. */ + BUG_ON(register_pidns_sysctls(&init_pid_ns)); +#endif + return 0; +} +subsys_initcall(pid_namespace_sysctl_init); + static struct file *__pidfd_fget(struct task_struct *task, int fd) { struct file *file; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d70ab49d5b4a..f1ffa032fc32 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -70,6 +70,8 @@ static void dec_pid_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); } +static void destroy_pid_namespace_work(struct work_struct *work); + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { @@ -105,17 +107,27 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; + ns->pid_max = parent_pid_ns->pid_max; + err = register_pidns_sysctls(ns); + if (err) + goto out_free_inum; + refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; + INIT_WORK(&ns->work, destroy_pid_namespace_work); + #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif + return ns; +out_free_inum: + ns_free_inum(&ns->ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -137,12 +149,28 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { + unregister_pidns_sysctls(ns); + ns_free_inum(&ns->ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } +static void destroy_pid_namespace_work(struct work_struct *work) +{ + struct pid_namespace *ns = + container_of(work, struct pid_namespace, 
work); + + do { + struct pid_namespace *parent; + + parent = ns->parent; + destroy_pid_namespace(ns); + ns = parent; + } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); +} + struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { @@ -155,15 +183,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - struct pid_namespace *parent; - - while (ns != &init_pid_ns) { - parent = ns->parent; - if (!refcount_dec_and_test(&ns->ns.count)) - break; - destroy_pid_namespace(ns); - ns = parent; - } + if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); @@ -274,6 +295,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; + tmp.extra2 = &pid_ns->pid_max; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); @@ -281,7 +303,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write, return ret; } -extern int pid_max; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", @@ -289,7 +310,7 @@ static struct ctl_table pid_ns_ctl_table[] = { .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &pid_max, + .extra2 = &init_pid_ns.pid_max, }, }; #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index afce8130d8b9..ca947ed32e3d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -257,11 +257,30 @@ config DPM_WATCHDOG boot session. config DPM_WATCHDOG_TIMEOUT - int "Watchdog timeout in seconds" + int "Watchdog timeout to panic in seconds" range 1 120 default 120 depends on DPM_WATCHDOG +config DPM_WATCHDOG_WARNING_TIMEOUT + int "Watchdog timeout to warn in seconds" + range 1 DPM_WATCHDOG_TIMEOUT + default DPM_WATCHDOG_TIMEOUT + depends on DPM_WATCHDOG + help + If the DPM watchdog warning timeout and main timeout are + different then a non-fatal warning (with a stack trace of + the stuck suspend routine) will be printed when the warning + timeout expires. If the suspend routine gets un-stuck + before the main timeout expires then no other action is + taken. If the routine continues to be stuck and the main + timeout expires then an emergency-level message and stack + trace will be printed and the system will panic. + + If the warning timeout is equal to the main timeout (the + default) then the warning will never happen and the system + will jump straight to panic when the main timeout expires. 
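The warn-then-panic behaviour described in the DPM_WATCHDOG_WARNING_TIMEOUT help text above can be pictured as a two-stage timer. The following is only a rough sketch with made-up names (two_stage_wd, two_stage_wd_handler), not the kernel's actual dpm_watchdog code; it merely mirrors the pattern the help text describes: warn and re-arm at the warning deadline, panic at the main deadline.

#include <linux/timer.h>
#include <linux/device.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>

/* Hypothetical warn-then-panic watchdog; names are illustrative only. */
struct two_stage_wd {
	struct timer_list timer;
	struct device *dev;
	bool fatal;	/* false: warning deadline armed, true: fatal deadline armed */
};

static void two_stage_wd_handler(struct timer_list *t)
{
	struct two_stage_wd *wd = from_timer(wd, t, timer);

	if (!wd->fatal) {
		/* Warning stage: report where we are stuck, then arm the fatal deadline. */
		dev_warn(wd->dev, "suspend callback still running, watchdog warning\n");
		show_stack(NULL, NULL, KERN_WARNING);
		wd->fatal = true;
		mod_timer(&wd->timer, jiffies + HZ *
			  (CONFIG_DPM_WATCHDOG_TIMEOUT - CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT));
		return;
	}
	/* Main timeout: the callback never completed, give up. */
	panic("suspend watchdog timeout on %s\n", dev_name(wd->dev));
}

If the suspend callback completes before either deadline, the caller simply deletes the timer and neither stage fires, which matches the "no other action is taken" case above. When the two Kconfig values are equal, a real implementation would arm the fatal deadline directly so that no warning is ever printed, as the help text states.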
+ config PM_TRACE bool help diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index b29c8aca7486..865df641b97c 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -9,7 +9,6 @@ #include <linux/device.h> #include <linux/mutex.h> -#include <linux/pm_wakeup.h> #include "power.h" diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d07faf42eace..3874f0e97651 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -908,3 +908,20 @@ int em_update_performance_limits(struct em_perf_domain *pd, return 0; } EXPORT_SYMBOL_GPL(em_update_performance_limits); + +static void rebuild_sd_workfn(struct work_struct *work) +{ + rebuild_sched_domains_energy(); +} + +void em_rebuild_sched_domains(void) +{ + static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); +} diff --git a/kernel/power/power.h b/kernel/power/power.h index de0e6b1077f2..c352dea2f67b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -110,7 +110,7 @@ extern int hibernate_preallocate_memory(void); extern void clear_or_poison_free_pages(void); -/** +/* * Auxiliary structure used for reading the snapshot image data and * metadata from and writing them to the list of page backup entries * (PBEs) which is the main data structure of swsusp. diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index c6bb47666aef..a91bdf802967 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -338,3 +338,9 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped); void console_prepend_replay(struct printk_message *pmsg); #endif + +#ifdef CONFIG_SMP +bool is_printk_cpu_sync_owner(void); +#else +static inline bool is_printk_cpu_sync_owner(void) { return false; } +#endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 80910bc3470c..07668433644b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -523,7 +523,7 @@ static struct latched_seq clear_seq = { /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) -#define LOG_BUF_LEN_MAX (u32)(1 << 31) +#define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -4922,6 +4922,11 @@ void console_try_replay_all(void) static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); +bool is_printk_cpu_sync_owner(void) +{ + return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); +} + /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 6f94418d53ff..32a28f563b13 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -61,10 +61,15 @@ bool is_printk_legacy_deferred(void) /* * The per-CPU variable @printk_context can be read safely in any * context. CPU migration is always disabled when set. + * + * A context holding the printk_cpu_sync must not spin waiting for + * another CPU. For legacy printing, it could be the console_lock + * or the port lock. 
*/ return (force_legacy_kthread() || this_cpu_read(printk_context) || - in_nmi()); + in_nmi() || + is_printk_cpu_sync_owner()); } asmlinkage int vprintk(const char *fmt, va_list args) @@ -74,15 +79,6 @@ asmlinkage int vprintk(const char *fmt, va_list args) if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); #endif - - /* - * Use the main logbuf even in NMI. But avoid calling console - * drivers that might have their own locks. - */ - if (is_printk_legacy_deferred()) - return vprintk_deferred(fmt, args); - - /* No obstacles. */ return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk); diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 9b0b52e1836f..6af90510a1ca 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -53,6 +53,37 @@ config RCU_TORTURE_TEST Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. +config RCU_TORTURE_TEST_CHK_RDR_STATE + tristate "Check rcutorture reader state" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to check the desired rcutorture + reader state for each segment against the actual context. + Note that PREEMPT_COUNT must be enabled if the preempt-disabled + and bh-disabled checks are to take effect, and that PREEMPT_RCU + must be enabled for the RCU-nesting checks to take effect. + These checks add overhead, and this Kconfig options is therefore + disabled by default. + + Say Y here if you want rcutorture reader contexts checked. + Say N if you are unsure. + +config RCU_TORTURE_TEST_LOG_CPU + tristate "Log CPU for rcutorture failures" + depends on RCU_TORTURE_TEST + default n + help + This option causes rcutorture to decorate each entry of its + log of failure/close-call rcutorture reader segments with the + number of the CPU that the reader was running on at the time. + This information can be useful, but it does incur additional + overhead, overhead that can make both failures and close calls + less probable. + + Say Y here if you want CPU IDs logged. + Say N if you are unsure. 
+ config RCU_REF_SCALE_TEST tristate "Scalability tests for read-side synchronization (RCU and others)" depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 612d27690335..d26fb1d33ed9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -92,12 +92,20 @@ torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives"); torture_param(bool, gp_cond_exp_full, false, "Use conditional/async full-stateexpedited GP wait primitives"); +torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal conditional grace periods, us (default 16 jiffies)"); +torture_param(int, gp_cond_wi_exp, 128, + "Wait interval for expedited conditional grace periods, us (default 128 us)"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives"); torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives"); +torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ, + "Wait interval for normal polled grace periods, us (default 16 jiffies)"); +torture_param(int, gp_poll_wi_exp, 128, + "Wait interval for expedited polled grace periods, us (default 128 us)"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -109,9 +117,11 @@ torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); +torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable"); +torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable"); -torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit."); +torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit."); torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); @@ -149,6 +159,7 @@ static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; +static struct task_struct *preempt_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -259,10 +270,13 @@ struct rt_read_seg { unsigned long rt_delay_ms; unsigned long rt_delay_us; bool rt_preempted; + int rt_cpu; + int rt_end_cpu; }; static int err_segs_recorded; static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS]; static int rt_read_nsegs; +static int rt_read_preempted; static const 
char *rcu_torture_writer_state_getname(void) { @@ -353,7 +367,8 @@ struct rcu_torture_ops { void (*read_delay)(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp); void (*readunlock)(int idx); - int (*readlock_held)(void); + int (*readlock_held)(void); // lockdep. + int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); @@ -390,6 +405,7 @@ struct rcu_torture_ops { void (*get_gp_data)(int *flags, unsigned long *gp_seq); void (*gp_slow_register)(atomic_t *rgssp); void (*gp_slow_unregister)(atomic_t *rgssp); + bool (*reader_blocked)(void); long cbflood_max; int irq_capable; int can_boost; @@ -448,10 +464,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) rtrsp->rt_delay_us = shortdelay_us; } if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 500))) { + !(torture_random(rrsp) % (nrealreaders * 500))) torture_preempt_schedule(); /* QS only if preemptible. */ - rtrsp->rt_preempted = true; - } } static void rcu_torture_read_unlock(int idx) @@ -459,6 +473,15 @@ static void rcu_torture_read_unlock(int idx) rcu_read_unlock(); } +static int rcu_torture_readlock_nesting(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return rcu_preempt_depth(); + if (IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return (preempt_count() & PREEMPT_MASK); + return -1; +} + /* * Update callback in the pipe. This should be invoked after a grace period. */ @@ -548,6 +571,7 @@ static struct rcu_torture_ops rcu_ops = { .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .readlock_held = torture_readlock_not_held, + .readlock_nesting = rcu_torture_readlock_nesting, .get_gp_seq = rcu_get_gp_seq, .gp_diff = rcu_seq_diff, .deferred_free = rcu_torture_deferred_free, @@ -573,6 +597,7 @@ static struct rcu_torture_ops rcu_ops = { .start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full, .poll_gp_state_exp = poll_state_synchronize_rcu, .cond_sync_exp = cond_synchronize_rcu_expedited, + .cond_sync_exp_full = cond_synchronize_rcu_expedited_full, .call = call_rcu_hurry, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -582,6 +607,9 @@ static struct rcu_torture_ops rcu_ops = { .get_gp_data = rcutorture_get_gp_data, .gp_slow_register = rcu_gp_slow_register, .gp_slow_unregister = rcu_gp_slow_unregister, + .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU) + ? 
has_rcu_reader_blocked + : NULL, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -628,6 +656,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, .irq_capable = 1, + .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted" }; @@ -650,17 +679,17 @@ static int srcu_torture_read_lock(void) int idx; int ret = 0; - if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) { + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { idx = srcu_read_lock(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx; } - if (reader_flavor & 0x2) { + if (reader_flavor & SRCU_READ_FLAVOR_NMI) { idx = srcu_read_lock_nmisafe(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } - if (reader_flavor & 0x4) { + if (reader_flavor & SRCU_READ_FLAVOR_LITE) { idx = srcu_read_lock_lite(srcu_ctlp); WARN_ON_ONCE(idx & ~0x1); ret += idx << 2; @@ -690,11 +719,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) static void srcu_torture_read_unlock(int idx) { WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); - if (reader_flavor & 0x4) + if (reader_flavor & SRCU_READ_FLAVOR_LITE) srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); - if (reader_flavor & 0x2) + if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); - if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) srcu_read_unlock(srcu_ctlp, idx & 0x1); } @@ -857,7 +886,7 @@ static void synchronize_rcu_trivial(void) int cpu; for_each_online_cpu(cpu) { - torture_sched_setaffinity(current->pid, cpumask_of(cpu)); + torture_sched_setaffinity(current->pid, cpumask_of(cpu), true); WARN_ON_ONCE(raw_smp_processor_id() != cpu); } } @@ -1347,6 +1376,7 @@ static void rcu_torture_write_types(void) pr_alert("%s: gp_sync without primitives.\n", __func__); } pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes); + pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp); } /* @@ -1513,7 +1543,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; gp_snap = cur_ops->get_gp_state(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1521,7 +1552,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_EXP: rcu_torture_writer_state = RTWS_COND_GET_EXP; gp_snap = cur_ops->get_gp_state_exp(); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP; cur_ops->cond_sync_exp(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1529,7 +1561,8 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET_FULL: rcu_torture_writer_state = RTWS_COND_GET_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_FULL; cur_ops->cond_sync_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); @@ -1537,7 +1570,8 @@ rcu_torture_writer(void *arg) 
case RTWS_COND_GET_EXP_FULL: rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL; cur_ops->get_gp_state_full(&gp_snap_full); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp, + 1000, &rand); rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL; cur_ops->cond_sync_exp_full(&gp_snap_full); rcu_torture_pipe_update(old_rp); @@ -1557,8 +1591,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1578,8 +1612,8 @@ rcu_torture_writer(void *arg) break; } WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size); - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi, + 1000, &rand); } rcu_torture_pipe_update(old_rp); break; @@ -1588,8 +1622,8 @@ rcu_torture_writer(void *arg) gp_snap = cur_ops->start_gp_poll_exp(); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; while (!cur_ops->poll_gp_state_exp(gp_snap)) - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_POLL_GET_EXP_FULL: @@ -1597,8 +1631,8 @@ rcu_torture_writer(void *arg) cur_ops->start_gp_poll_exp_full(&gp_snap_full); rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL; while (!cur_ops->poll_gp_state_full(&gp_snap_full)) - torture_hrtimeout_jiffies(torture_random(&rand) % 16, - &rand); + torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp, + 1000, &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_SYNC: @@ -1835,6 +1869,44 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. } +// Verify the specified RCUTORTURE_RDR* state. +#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) +{ + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) + return; + + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS); + WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); + + // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->readlock_nesting && + (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + cur_ops->readlock_nesting() == 0, ROEC_ARGS); + + // Timer handlers have all sorts of stuff disabled, so ignore + // unintended disabling. 
+ if (insoftirq) + return; + + WARN_ONCE(cur_ops->extendables && + !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && + (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->extendables && + !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && + (preempt_count() & PREEMPT_MASK), ROEC_ARGS); + WARN_ONCE(cur_ops->readlock_nesting && + !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && + cur_ops->readlock_nesting() > 0, ROEC_ARGS); +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1844,10 +1916,11 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, +static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { + bool first; unsigned long flags; int idxnew1 = -1; int idxnew2 = -1; @@ -1856,8 +1929,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; + first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1876,6 +1951,21 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (statesnew & RCUTORTURE_RDR_RCU_2) idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; + // Complain unless both the old and the new protection is in place. + rcutorture_one_extend_check("during change", + idxold1 | statesnew, statesnew, statesold, insoftirq); + + // Sample CPU under both sets of protections to reduce confusion. + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + int cpu = raw_smp_processor_id(); + rtrsp->rt_cpu = cpu; + if (!first) { + rtrsp[-1].rt_end_cpu = cpu; + if (cur_ops->reader_blocked) + rtrsp[-1].rt_preempted = cur_ops->reader_blocked(); + } + } + /* * Next, remove old protection, in decreasing order of strength * to avoid unlock paths that aren't safe in the stronger @@ -1926,6 +2016,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, WARN_ON_ONCE(*readstate < 0); if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) pr_info("Unexpected readstate value of %#x\n", *readstate); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1992,7 +2083,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. 
*/ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, +rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; @@ -2007,7 +2098,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); - rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); + rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); } return &rtrsp[j]; } @@ -2028,6 +2119,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) int newstate; struct rcu_torture *p; int pipe_count; + bool preempted = false; int readstate = 0; struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } }; struct rt_read_seg *rtrsp = &rtseg[0]; @@ -2036,7 +2128,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); - rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); + rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++); if (checkpolling) { if (cur_ops->get_gp_state && cur_ops->poll_gp_state) cookie = cur_ops->get_gp_state(); @@ -2049,13 +2141,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) !cur_ops->readlock_held || cur_ops->readlock_held()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); return false; } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); rcu_torture_reader_do_mbchk(myid, p, trsp); - rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); + rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp); preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { @@ -2093,7 +2185,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) rcu_torture_writer_state, cpumask_pr_args(cpu_online_mask)); } - rcutorture_one_extend(&readstate, 0, trsp, rtrsp); + if (cur_ops->reader_blocked) + preempted = cur_ops->reader_blocked(); + rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp); WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. 
@@ -2105,6 +2199,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++) err_segs[i++] = *rtrsp1; rt_read_nsegs = i; + rt_read_preempted = preempted; } return true; @@ -2425,7 +2520,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "read_exit_delay=%d read_exit_burst=%d " "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " - "test_nmis=%d\n", + "test_nmis=%d " + "preempt_duration=%d preempt_interval=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2438,7 +2534,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) read_exit_delay, read_exit_burst, reader_flavor, nocbs_nthreads, nocbs_toggle, - test_nmis); + test_nmis, + preempt_duration, preempt_interval); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3068,12 +3165,12 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress = 0; return 0; } - if (stall_cpu > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing"); fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ - WARN_ON(1); /* Make sure rcutorture notices conflict. */ + WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */ return 0; } if (fwd_progress_holdoff <= 0) @@ -3418,6 +3515,35 @@ static void rcutorture_test_nmis(int n) #endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST) } +// Randomly preempt online CPUs. +static int rcu_torture_preempt(void *unused) +{ + int cpu = -1; + DEFINE_TORTURE_RANDOM(rand); + + schedule_timeout_idle(stall_cpu_holdoff); + do { + // Wait for preempt_interval ms with up to 100us fuzz. + torture_hrtimeout_ms(preempt_interval, 100, &rand); + // Select online CPU. + cpu = cpumask_next(cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_next(-1, cpu_online_mask); + WARN_ON_ONCE(cpu >= nr_cpu_ids); + // Move to that CPU, if can't do so, retry later. + if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false)) + continue; + // Preempt at high-ish priority, then reset to normal. + sched_set_fifo(current); + torture_sched_setaffinity(current->pid, cpu_present_mask, true); + mdelay(preempt_duration); + sched_set_normal(current, 0); + stutter_wait("rcu_torture_preempt"); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_torture_preempt"); + return 0; +} + static enum cpuhp_state rcutor_hp; static void @@ -3446,6 +3572,7 @@ rcu_torture_cleanup(void) if (cur_ops->gp_kthread_dbg) cur_ops->gp_kthread_dbg(); + torture_stop_kthread(rcu_torture_preempt, preempt_task); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); rcu_torture_fwd_prog_cleanup(); @@ -3508,26 +3635,49 @@ rcu_torture_cleanup(void) pr_alert("\t: No segments recorded!!!\n"); firsttime = 1; for (i = 0; i < rt_read_nsegs; i++) { - pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate); + pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate); if (err_segs[i].rt_delay_jiffies != 0) { pr_cont("%s%ldjiffies", firsttime ? 
"" : "+", err_segs[i].rt_delay_jiffies); firsttime = 0; } + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { + pr_cont(" CPU %2d", err_segs[i].rt_cpu); + if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu) + pr_cont("->%-2d", err_segs[i].rt_end_cpu); + else + pr_cont(" ..."); + } if (err_segs[i].rt_delay_ms != 0) { - pr_cont("%s%ldms", firsttime ? "" : "+", + pr_cont(" %s%ldms", firsttime ? "" : "+", err_segs[i].rt_delay_ms); firsttime = 0; } if (err_segs[i].rt_delay_us != 0) { - pr_cont("%s%ldus", firsttime ? "" : "+", + pr_cont(" %s%ldus", firsttime ? "" : "+", err_segs[i].rt_delay_us); firsttime = 0; } - pr_cont("%s\n", - err_segs[i].rt_preempted ? "preempted" : ""); + pr_cont("%s", err_segs[i].rt_preempted ? " preempted" : ""); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH) + pr_cont(" BH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ) + pr_cont(" IRQ"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT) + pr_cont(" PREEMPT"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH) + pr_cont(" RBH"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED) + pr_cont(" SCHED"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1) + pr_cont(" RCU_1"); + if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2) + pr_cont(" RCU_2"); + pr_cont("\n"); } + if (rt_read_preempted) + pr_alert("\tReader was preempted.\n"); } if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); @@ -4019,6 +4169,11 @@ rcu_torture_init(void) firsterr = rcu_torture_read_exit_init(); if (torture_init_error(firsterr)) goto unwind; + if (preempt_duration > 0) { + firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task); + if (torture_init_error(firsterr)) + goto unwind; + } if (object_debug) rcu_test_debug_objects(); torture_init_end(); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index aacfcc9838b3..1b47376acdc4 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/torture.h> #include <linux/types.h> +#include <linux/sched/clock.h> #include "rcu.h" @@ -531,6 +532,39 @@ static const struct ref_scale_ops acqrel_ops = { static volatile u64 stopopts; +static void ref_sched_clock_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += sched_clock(); + preempt_enable(); + stopopts = x; +} + +static void ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += sched_clock(); + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static const struct ref_scale_ops sched_clock_ops = { + .readsection = ref_sched_clock_section, + .delaysection = ref_sched_clock_delay_section, + .name = "sched-clock" +}; + + static void ref_clock_section(const int nloops) { u64 x = 0; @@ -1130,9 +1164,9 @@ ref_scale_init(void) int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS - &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, - &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, - &typesafe_seqlock_ops, + &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, + &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, + &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; if (!torture_init_begin(scale_type, verbose)) diff --git 
a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5e2e53464794..b83c74c4dcc0 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -738,7 +738,8 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor); /* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. - * Returns an index that must be passed to the matching srcu_read_unlock(). + * Returns a guaranteed non-negative index that must be passed to the + * matching __srcu_read_unlock(). */ int __srcu_read_lock(struct srcu_struct *ssp) { @@ -1076,7 +1077,6 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, /* If grace period not already in progress, start it. */ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) { - WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed)); srcu_gp_start(ssp); // And how can that list_add() in the "else" clause diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa5..2795d6b5109c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -149,7 +149,6 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -186,26 +185,6 @@ static int rcu_unlock_delay; module_param(rcu_unlock_delay, int, 0444); #endif -/* - * This rcu parameter is runtime-read-only. It reflects - * a minimum allowed number of objects which can be cached - * per-CPU. Object size is equal to one page. This value - * can be changed at boot time. - */ -static int rcu_min_cached_objs = 5; -module_param(rcu_min_cached_objs, int, 0444); - -// A page shrinker can ask for pages to be freed to make them -// available for other parts of the system. This usually happens -// under low memory conditions, and in that case we should also -// defer page-cache filling for a short time period. -// -// The default value is 5 seconds, which is long enough to reduce -// interference with the shrinker while it asks other systems to -// drain their caches. -static int rcu_delay_page_cache_fill_msec = 5000; -module_param(rcu_delay_page_cache_fill_msec, int, 0444); - /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { @@ -3084,8 +3063,11 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) head->func = func; head->next = NULL; kasan_record_aux_stack_noalloc(head); + local_irq_save(flags); rdp = this_cpu_ptr(&rcu_data); + RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!"); + lazy = lazy_in && !rcu_async_should_hurry(); /* Add the callback to our list. */ @@ -3191,812 +3173,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); -/* Maximum number of jiffies to wait before draining a batch. */ -#define KFREE_DRAIN_JIFFIES (5 * HZ) -#define KFREE_N_BATCHES 2 -#define FREE_N_CHANNELS 2 - -/** - * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers - * @list: List node. 
All blocks are linked between each other - * @gp_snap: Snapshot of RCU state for objects placed to this bulk - * @nr_records: Number of active pointers in the array - * @records: Array of the kvfree_rcu() pointers - */ -struct kvfree_rcu_bulk_data { - struct list_head list; - struct rcu_gp_oldstate gp_snap; - unsigned long nr_records; - void *records[] __counted_by(nr_records); -}; - -/* - * This macro defines how many entries the "records" array - * will contain. It is based on the fact that the size of - * kvfree_rcu_bulk_data structure becomes exactly one page. - */ -#define KVFREE_BULK_MAX_ENTR \ - ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) - -/** - * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests - * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period - * @head_free: List of kfree_rcu() objects waiting for a grace period - * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. - * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period - * @krcp: Pointer to @kfree_rcu_cpu structure - */ - -struct kfree_rcu_cpu_work { - struct rcu_work rcu_work; - struct rcu_head *head_free; - struct rcu_gp_oldstate head_free_gp_snap; - struct list_head bulk_head_free[FREE_N_CHANNELS]; - struct kfree_rcu_cpu *krcp; -}; - -/** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period - * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" - * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period - * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period - * @lock: Synchronize access to this structure - * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES - * @initialized: The @rcu_work fields have been initialized - * @head_count: Number of objects in rcu_head singular list - * @bulk_count: Number of objects in bulk-list - * @bkvcache: - * A simple cache list that contains objects for reuse purpose. - * In order to save some per-cpu space the list is singular. - * Even though it is lockless an access has to be protected by the - * per-cpu lock. - * @page_cache_work: A work to refill the cache when it is empty - * @backoff_page_cache_fill: Delay cache refills - * @work_in_progress: Indicates that page_cache_work is running - * @hrtimer: A hrtimer for scheduling a page_cache_work - * @nr_bkv_objs: number of allocated objects at @bkvcache. - * - * This is a per-CPU structure. The reason that it is not included in - * the rcu_data structure is to permit this code to be extracted from - * the RCU files. Such extraction could allow further optimization of - * the interactions with the slab allocators. - */ -struct kfree_rcu_cpu { - // Objects queued on a linked list - // through their rcu_head structures. - struct rcu_head *head; - unsigned long head_gp_snap; - atomic_t head_count; - - // Objects queued on a bulk-list. 
- struct list_head bulk_head[FREE_N_CHANNELS]; - atomic_t bulk_count[FREE_N_CHANNELS]; - - struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; - raw_spinlock_t lock; - struct delayed_work monitor_work; - bool initialized; - - struct delayed_work page_cache_work; - atomic_t backoff_page_cache_fill; - atomic_t work_in_progress; - struct hrtimer hrtimer; - - struct llist_head bkvcache; - int nr_bkv_objs; -}; - -static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), -}; - -static __always_inline void -debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - int i; - - for (i = 0; i < bhead->nr_records; i++) - debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); -#endif -} - -static inline struct kfree_rcu_cpu * -krc_this_cpu_lock(unsigned long *flags) -{ - struct kfree_rcu_cpu *krcp; - - local_irq_save(*flags); // For safely calling this_cpu_ptr(). - krcp = this_cpu_ptr(&krc); - raw_spin_lock(&krcp->lock); - - return krcp; -} - -static inline void -krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static inline struct kvfree_rcu_bulk_data * -get_cached_bnode(struct kfree_rcu_cpu *krcp) -{ - if (!krcp->nr_bkv_objs) - return NULL; - - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1); - return (struct kvfree_rcu_bulk_data *) - llist_del_first(&krcp->bkvcache); -} - -static inline bool -put_cached_bnode(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode) -{ - // Check the limit. - if (krcp->nr_bkv_objs >= rcu_min_cached_objs) - return false; - - llist_add((struct llist_node *) bnode, &krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1); - return true; -} - -static int -drain_page_cache(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - struct llist_node *page_list, *pos, *n; - int freed = 0; - - if (!rcu_min_cached_objs) - return 0; - - raw_spin_lock_irqsave(&krcp->lock, flags); - page_list = llist_del_all(&krcp->bkvcache); - WRITE_ONCE(krcp->nr_bkv_objs, 0); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - llist_for_each_safe(pos, n, page_list) { - free_page((unsigned long)pos); - freed++; - } - - return freed; -} - -static void -kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, - struct kvfree_rcu_bulk_data *bnode, int idx) -{ - unsigned long flags; - int i; - - if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) { - debug_rcu_bhead_unqueue(bnode); - rcu_lock_acquire(&rcu_callback_map); - if (idx == 0) { // kmalloc() / kfree(). - trace_rcu_invoke_kfree_bulk_callback( - rcu_state.name, bnode->nr_records, - bnode->records); - - kfree_bulk(bnode->nr_records, bnode->records); - } else { // vmalloc() / vfree(). 
- for (i = 0; i < bnode->nr_records; i++) { - trace_rcu_invoke_kvfree_callback( - rcu_state.name, bnode->records[i], 0); - - vfree(bnode->records[i]); - } - } - rcu_lock_release(&rcu_callback_map); - } - - raw_spin_lock_irqsave(&krcp->lock, flags); - if (put_cached_bnode(krcp, bnode)) - bnode = NULL; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (bnode) - free_page((unsigned long) bnode); - - cond_resched_tasks_rcu_qs(); -} - -static void -kvfree_rcu_list(struct rcu_head *head) -{ - struct rcu_head *next; - - for (; head; head = next) { - void *ptr = (void *) head->func; - unsigned long offset = (void *) head - ptr; - - next = head->next; - debug_rcu_head_unqueue((struct rcu_head *)ptr); - rcu_lock_acquire(&rcu_callback_map); - trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); - - rcu_lock_release(&rcu_callback_map); - cond_resched_tasks_rcu_qs(); - } -} - -/* - * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bulk_head_free or ->head_free. - */ -static void kfree_rcu_work(struct work_struct *work) -{ - unsigned long flags; - struct kvfree_rcu_bulk_data *bnode, *n; - struct list_head bulk_head[FREE_N_CHANNELS]; - struct rcu_head *head; - struct kfree_rcu_cpu *krcp; - struct kfree_rcu_cpu_work *krwp; - struct rcu_gp_oldstate head_gp_snap; - int i; - - krwp = container_of(to_rcu_work(work), - struct kfree_rcu_cpu_work, rcu_work); - krcp = krwp->krcp; - - raw_spin_lock_irqsave(&krcp->lock, flags); - // Channels 1 and 2. - for (i = 0; i < FREE_N_CHANNELS; i++) - list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]); - - // Channel 3. - head = krwp->head_free; - krwp->head_free = NULL; - head_gp_snap = krwp->head_free_gp_snap; - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - // Handle the first two channels. - for (i = 0; i < FREE_N_CHANNELS; i++) { - // Start from the tail page, so a GP is likely passed for it. - list_for_each_entry_safe(bnode, n, &bulk_head[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - /* - * This is used when the "bulk" path can not be used for the - * double-argument of kvfree_rcu(). This happens when the - * page-cache is empty, which means that objects are instead - * queued on a linked list through their rcu_head structures. - * This list is named "Channel 3". - */ - if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap))) - kvfree_rcu_list(head); -} - -static bool -need_offload_krc(struct kfree_rcu_cpu *krcp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krcp->bulk_head[i])) - return true; - - return !!READ_ONCE(krcp->head); -} - -static bool -need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) -{ - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - if (!list_empty(&krwp->bulk_head_free[i])) - return true; - - return !!krwp->head_free; -} - -static int krc_count(struct kfree_rcu_cpu *krcp) -{ - int sum = atomic_read(&krcp->head_count); - int i; - - for (i = 0; i < FREE_N_CHANNELS; i++) - sum += atomic_read(&krcp->bulk_count[i]); - - return sum; -} - -static void -__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - long delay, delay_left; - - delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 
1:KFREE_DRAIN_JIFFIES; - if (delayed_work_pending(&krcp->monitor_work)) { - delay_left = krcp->monitor_work.timer.expires - jiffies; - if (delay < delay_left) - mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); - return; - } - queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay); -} - -static void -schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&krcp->lock, flags); - __schedule_delayed_monitor_work(krcp); - raw_spin_unlock_irqrestore(&krcp->lock, flags); -} - -static void -kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) -{ - struct list_head bulk_ready[FREE_N_CHANNELS]; - struct kvfree_rcu_bulk_data *bnode, *n; - struct rcu_head *head_ready = NULL; - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&krcp->lock, flags); - for (i = 0; i < FREE_N_CHANNELS; i++) { - INIT_LIST_HEAD(&bulk_ready[i]); - - list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { - if (!poll_state_synchronize_rcu_full(&bnode->gp_snap)) - break; - - atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); - list_move(&bnode->list, &bulk_ready[i]); - } - } - - if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { - head_ready = krcp->head; - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - for (i = 0; i < FREE_N_CHANNELS; i++) { - list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) - kvfree_rcu_bulk(krcp, bnode, i); - } - - if (head_ready) - kvfree_rcu_list(head_ready); -} - -/* - * Return: %true if a work is queued, %false otherwise. - */ -static bool -kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) -{ - unsigned long flags; - bool queued = false; - int i, j; - - raw_spin_lock_irqsave(&krcp->lock, flags); - - // Attempt to start a new batch. - for (i = 0; i < KFREE_N_BATCHES; i++) { - struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); - - // Try to detach bulk_head or head and attach it, only when - // all channels are free. Any channel is not free means at krwp - // there is on-going rcu work to handle krwp's free business. - if (need_wait_for_krwp_work(krwp)) - continue; - - // kvfree_rcu_drain_ready() might handle this krcp, if so give up. - if (need_offload_krc(krcp)) { - // Channel 1 corresponds to the SLAB-pointer bulk path. - // Channel 2 corresponds to vmalloc-pointer bulk path. - for (j = 0; j < FREE_N_CHANNELS; j++) { - if (list_empty(&krwp->bulk_head_free[j])) { - atomic_set(&krcp->bulk_count[j], 0); - list_replace_init(&krcp->bulk_head[j], - &krwp->bulk_head_free[j]); - } - } - - // Channel 3 corresponds to both SLAB and vmalloc - // objects queued on the linked list. - if (!krwp->head_free) { - krwp->head_free = krcp->head; - get_state_synchronize_rcu_full(&krwp->head_free_gp_snap); - atomic_set(&krcp->head_count, 0); - WRITE_ONCE(krcp->head, NULL); - } - - // One work is per one batch, so there are three - // "free channels", the batch can handle. Break - // the loop since it is done with this CPU thus - // queuing an RCU work is _always_ success here. - queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work); - WARN_ON_ONCE(!queued); - break; - } - } - - raw_spin_unlock_irqrestore(&krcp->lock, flags); - return queued; -} - -/* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. - */ -static void kfree_rcu_monitor(struct work_struct *work) -{ - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); - - // Drain ready for reclaim. 
- kvfree_rcu_drain_ready(krcp); - - // Queue a batch for a rest. - kvfree_rcu_queue_batch(krcp); - - // If there is nothing to detach, it means that our job is - // successfully done here. In case of having at least one - // of the channels that is still busy we should rearm the - // work to repeat an attempt. Because previous batches are - // still in progress. - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); -} - -static enum hrtimer_restart -schedule_page_work_fn(struct hrtimer *t) -{ - struct kfree_rcu_cpu *krcp = - container_of(t, struct kfree_rcu_cpu, hrtimer); - - queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); - return HRTIMER_NORESTART; -} - -static void fill_page_cache_func(struct work_struct *work) -{ - struct kvfree_rcu_bulk_data *bnode; - struct kfree_rcu_cpu *krcp = - container_of(work, struct kfree_rcu_cpu, - page_cache_work.work); - unsigned long flags; - int nr_pages; - bool pushed; - int i; - - nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ? - 1 : rcu_min_cached_objs; - - for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - - if (!bnode) - break; - - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); - - if (!pushed) { - free_page((unsigned long) bnode); - break; - } - } - - atomic_set(&krcp->work_in_progress, 0); - atomic_set(&krcp->backoff_page_cache_fill, 0); -} - -static void -run_page_cache_worker(struct kfree_rcu_cpu *krcp) -{ - // If cache disabled, bail out. - if (!rcu_min_cached_objs) - return; - - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !atomic_xchg(&krcp->work_in_progress, 1)) { - if (atomic_read(&krcp->backoff_page_cache_fill)) { - queue_delayed_work(system_unbound_wq, - &krcp->page_cache_work, - msecs_to_jiffies(rcu_delay_page_cache_fill_msec)); - } else { - hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - krcp->hrtimer.function = schedule_page_work_fn; - hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); - } - } -} - -// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() -// state specified by flags. If can_alloc is true, the caller must -// be schedulable and not be holding any locks or mutexes that might be -// acquired by the memory allocator or anything that it might invoke. -// Returns true if ptr was successfully recorded, else the caller must -// use a fallback. -static inline bool -add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, - unsigned long *flags, void *ptr, bool can_alloc) -{ - struct kvfree_rcu_bulk_data *bnode; - int idx; - - *krcp = krc_this_cpu_lock(flags); - if (unlikely(!(*krcp)->initialized)) - return false; - - idx = !!is_vmalloc_addr(ptr); - bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], - struct kvfree_rcu_bulk_data, list); - - /* Check if a new block is required. */ - if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(*krcp); - if (!bnode && can_alloc) { - krc_this_cpu_unlock(*krcp, *flags); - - // __GFP_NORETRY - allows a light-weight direct reclaim - // what is OK from minimizing of fallback hitting point of - // view. Apart of that it forbids any OOM invoking what is - // also beneficial since we are about to release memory soon. - // - // __GFP_NOMEMALLOC - prevents from consuming of all the - // memory reserves. Please note we have a fallback path. 
- // - // __GFP_NOWARN - it is supposed that an allocation can - // be failed under low memory or high memory pressure - // scenarios. - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - raw_spin_lock_irqsave(&(*krcp)->lock, *flags); - } - - if (!bnode) - return false; - - // Initialize the new block and attach it. - bnode->nr_records = 0; - list_add(&bnode->list, &(*krcp)->bulk_head[idx]); - } - - // Finally insert and update the GP for this page. - bnode->nr_records++; - bnode->records[bnode->nr_records - 1] = ptr; - get_state_synchronize_rcu_full(&bnode->gp_snap); - atomic_inc(&(*krcp)->bulk_count[idx]); - - return true; -} - -/* - * Queue a request for lazy invocation of the appropriate free routine - * after a grace period. Please note that three paths are maintained, - * two for the common case using arrays of pointers and a third one that - * is used only when the main paths cannot be used, for example, due to - * memory pressure. - * - * Each kvfree_call_rcu() request is added to a batch. The batch will be drained - * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will - * be free'd in workqueue context. This allows us to: batch requests together to - * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. - */ -void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - unsigned long flags; - struct kfree_rcu_cpu *krcp; - bool success; - - /* - * Please note there is a limitation for the head-less - * variant, that is why there is a clear rule for such - * objects: it can be used from might_sleep() context - * only. For other places please embed an rcu_head to - * your data. - */ - if (!head) - might_sleep(); - - // Queue the object but don't yet schedule the batch. - if (debug_rcu_head_queue(ptr)) { - // Probable double kfree_rcu(), just leak. - WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", - __func__, head); - - // Mark as success and leave. - return; - } - - kasan_record_aux_stack_noalloc(ptr); - success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); - if (!success) { - run_page_cache_worker(krcp); - - if (head == NULL) - // Inline if kvfree_rcu(one_arg) call. - goto unlock_return; - - head->func = ptr; - head->next = krcp->head; - WRITE_ONCE(krcp->head, head); - atomic_inc(&krcp->head_count); - - // Take a snapshot for this krcp. - krcp->head_gp_snap = get_state_synchronize_rcu(); - success = true; - } - - /* - * The kvfree_rcu() caller considers the pointer freed at this point - * and likely removes any references to it. Since the actual slab - * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore - * this object (no scanning or false positives reporting). - */ - kmemleak_ignore(ptr); - - // Set timer to drain after KFREE_DRAIN_JIFFIES. - if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - __schedule_delayed_monitor_work(krcp); - -unlock_return: - krc_this_cpu_unlock(krcp, flags); - - /* - * Inline kvfree() after synchronize_rcu(). We can do - * it from might_sleep() context only, so the current - * CPU can pass the QS state. - */ - if (!success) { - debug_rcu_head_unqueue((struct rcu_head *) ptr); - synchronize_rcu(); - kvfree(ptr); - } -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); - -/** - * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. - * - * Note that a single argument of kvfree_rcu() call has a slow path that - * triggers synchronize_rcu() following by freeing a pointer. 
It is done - * before the return from the function. Therefore for any single-argument - * call that will result in a kfree() to a cache that is to be destroyed - * during module exit, it is developer's responsibility to ensure that all - * such calls have returned before the call to kmem_cache_destroy(). - */ -void kvfree_rcu_barrier(void) -{ - struct kfree_rcu_cpu_work *krwp; - struct kfree_rcu_cpu *krcp; - bool queued; - int i, cpu; - - /* - * Firstly we detach objects and queue them over an RCU-batch - * for all CPUs. Finally queued works are flushed for each CPU. - * - * Please note. If there are outstanding batches for a particular - * CPU, those have to be finished first following by queuing a new. - */ - for_each_possible_cpu(cpu) { - krcp = per_cpu_ptr(&krc, cpu); - - /* - * Check if this CPU has any objects which have been queued for a - * new GP completion. If not(means nothing to detach), we are done - * with it. If any batch is pending/running for this "krcp", below - * per-cpu flush_rcu_work() waits its completion(see last step). - */ - if (!need_offload_krc(krcp)) - continue; - - while (1) { - /* - * If we are not able to queue a new RCU work it means: - * - batches for this CPU are still in flight which should - * be flushed first and then repeat; - * - no objects to detach, because of concurrency. - */ - queued = kvfree_rcu_queue_batch(krcp); - - /* - * Bail out, if there is no need to offload this "krcp" - * anymore. As noted earlier it can run concurrently. - */ - if (queued || !need_offload_krc(krcp)) - break; - - /* There are ongoing batches. */ - for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); - flush_rcu_work(&krwp->rcu_work); - } - } - } - - /* - * Now we guarantee that all objects are flushed. - */ - for_each_possible_cpu(cpu) { - krcp = per_cpu_ptr(&krc, cpu); - - /* - * A monitor work can drain ready to reclaim objects - * directly. Wait its completion if running or pending. - */ - cancel_delayed_work_sync(&krcp->monitor_work); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - krwp = &(krcp->krw_arr[i]); - flush_rcu_work(&krwp->rcu_work); - } - } -} -EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); - -static unsigned long -kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu; - unsigned long count = 0; - - /* Snapshot count of all CPUs */ - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count += krc_count(krcp); - count += READ_ONCE(krcp->nr_bkv_objs); - atomic_set(&krcp->backoff_page_cache_fill, 1); - } - - return count == 0 ? SHRINK_EMPTY : count; -} - -static unsigned long -kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) -{ - int cpu, freed = 0; - - for_each_possible_cpu(cpu) { - int count; - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - count = krc_count(krcp); - count += drain_page_cache(krcp); - kfree_rcu_monitor(&krcp->monitor_work.work); - - sc->nr_to_scan -= count; - freed += count; - - if (sc->nr_to_scan <= 0) - break; - } - - return freed == 0 ? SHRINK_STOP : freed; -} - -void __init kfree_rcu_scheduler_running(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - if (need_offload_krc(krcp)) - schedule_delayed_monitor_work(krcp); - } -} - /* * During early boot, any blocking grace-period wait automatically * implies a grace period. 
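The kvfree_rcu() machinery deleted from kernel/rcu/tree.c above (only the kernel/ side of the series is visible here, so its new home is not shown) batches pointers into page-sized blocks so an entire block can be freed in one pass once a grace period has elapsed, with a per-object fallback when no block can be allocated. The following stand-alone user-space sketch shows just that block-and-drain shape; MODEL_BLOCK_ENTRIES is an invented capacity in place of KVFREE_BULK_MAX_ENTR, and the explicit drain call stands in for the grace-period and workqueue machinery.

#include <stdio.h>
#include <stdlib.h>

#define MODEL_BLOCK_ENTRIES 8   /* invented stand-in for KVFREE_BULK_MAX_ENTR */

struct model_block {
        struct model_block *next;
        unsigned long nr_records;
        void *records[MODEL_BLOCK_ENTRIES];
};

static struct model_block *head;

/* Queue a pointer into the current block, opening a new block when full. */
static void model_queue(void *ptr)
{
        if (!head || head->nr_records == MODEL_BLOCK_ENTRIES) {
                struct model_block *b = calloc(1, sizeof(*b));

                if (!b) {               /* fallback path: free the object directly */
                        free(ptr);
                        return;
                }
                b->next = head;
                head = b;
        }
        head->records[head->nr_records++] = ptr;
}

/* Pretend a grace period has elapsed and release everything in bulk. */
static void model_drain(void)
{
        while (head) {
                struct model_block *b = head;

                for (unsigned long i = 0; i < b->nr_records; i++)
                        free(b->records[i]);
                head = b->next;
                free(b);
        }
}

int main(void)
{
        for (int i = 0; i < 20; i++)
                model_queue(malloc(16));
        model_drain();
        printf("drained\n");
        return 0;
}

The real code additionally snapshots the RCU grace-period state per block, keeps a small per-CPU page cache refilled by a worker, and separates SLAB and vmalloc pointers into distinct channels; the sketch shows only the fill-then-drain structure.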
@@ -4895,6 +4071,22 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) @@ -4917,16 +4109,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param); -} - -static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) -{ - struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker); - if (!kworker) - return NULL; - - return kworker->task; + rcu_thread_affine_rnp(kworker->task, rnp); + wake_up_process(kworker->task); } static void __init rcu_start_exp_gp_kworker(void) @@ -4934,7 +4119,7 @@ static void __init rcu_start_exp_gp_kworker(void) const char *name = "rcu_exp_gp_kthread_worker"; struct sched_param param = { .sched_priority = kthread_prio }; - rcu_exp_gp_kworker = kthread_create_worker(0, name); + rcu_exp_gp_kworker = kthread_run_worker(0, name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", name); rcu_exp_gp_kworker = NULL; @@ -5012,67 +4197,6 @@ int rcutree_prepare_cpu(unsigned int cpu) } /* - * Update kthreads affinity during CPU-hotplug changes. - * - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->kthread_mutex. - */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) -{ - cpumask_var_t cm; - unsigned long mask; - struct rcu_data *rdp; - struct rcu_node *rnp; - struct task_struct *task_boost, *task_exp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - - task_boost = rcu_boost_task(rnp); - task_exp = rcu_exp_par_gp_task(rnp); - - /* - * If CPU is the boot one, those tasks are created later from early - * initcall since kthreadd must be created first. - */ - if (!task_boost && !task_exp) - return; - - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - - mutex_lock(&rnp->kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - - if (task_exp) - set_cpus_allowed_ptr(task_exp, cm); - - if (task_boost) - set_cpus_allowed_ptr(task_boost, cm); - - mutex_unlock(&rnp->kthread_mutex); - - free_cpumask_var(cm); -} - -/* * Has the specified (known valid) CPU ever been fully online? */ bool rcu_cpu_beenfullyonline(int cpu) @@ -5100,7 +4224,6 @@ int rcutree_online_cpu(unsigned int cpu) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. 
*/ sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); @@ -5317,8 +4440,6 @@ int rcutree_offline_cpu(unsigned int cpu) rnp->ffmask &= ~rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcutree_affinity_setting(cpu, cpu); - // nohz_full CPUs need the tick for stop-machine to work quickly tick_dep_set(TICK_DEP_BIT_RCU); return 0; @@ -5648,62 +4769,12 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; -static void __init kfree_rcu_batch_init(void) -{ - int cpu; - int i, j; - struct shrinker *kfree_rcu_shrinker; - - /* Clamp it to [0:100] seconds interval. */ - if (rcu_delay_page_cache_fill_msec < 0 || - rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) { - - rcu_delay_page_cache_fill_msec = - clamp(rcu_delay_page_cache_fill_msec, 0, - (int) (100 * MSEC_PER_SEC)); - - pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n", - rcu_delay_page_cache_fill_msec); - } - - for_each_possible_cpu(cpu) { - struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - - for (i = 0; i < KFREE_N_BATCHES; i++) { - INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); - krcp->krw_arr[i].krcp = krcp; - - for (j = 0; j < FREE_N_CHANNELS; j++) - INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]); - } - - for (i = 0; i < FREE_N_CHANNELS; i++) - INIT_LIST_HEAD(&krcp->bulk_head[i]); - - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); - INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); - krcp->initialized = true; - } - - kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); - if (!kfree_rcu_shrinker) { - pr_err("Failed to allocate kfree_rcu() shrinker!\n"); - return; - } - - kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; - kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; - - shrinker_register(kfree_rcu_shrinker); -} - void __init rcu_init(void) { int cpu = smp_processor_id(); rcu_early_boot_tests(); - kfree_rcu_batch_init(); rcu_bootup_announce(); sanitize_kthread_prio(); rcu_init_geometry(); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fb664d3a01c9..77efed89c79e 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -227,16 +227,16 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake) /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. + * specified leaf rcu_node structure, which is acquired by the caller. */ -static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, +static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags, unsigned long mask, bool wake) + __releases(rnp->lock) { int cpu; - unsigned long flags; struct rcu_data *rdp; - raw_spin_lock_irqsave_rcu_node(rnp, flags); + raw_lockdep_assert_held_rcu_node(rnp); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; @@ -257,8 +257,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); - rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); + ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp); + rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true); } /* Common code for work-done checking. 
*/ @@ -432,8 +437,10 @@ retry_ipi: raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false); + if (mask_ofl_test) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false); + } } static void rcu_exp_sel_wait_wake(unsigned long s); @@ -712,6 +719,18 @@ static void rcu_exp_sel_wait_wake(unsigned long s) rcu_exp_wait_wake(s); } +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + lockdep_assert_irqs_disabled(); + ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp)); + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + #ifdef CONFIG_PREEMPT_RCU /* @@ -730,24 +749,34 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, the common case of not being in an RCU read-side + * First, is there no need for a quiescent state from this CPU, + * or is this CPU already looking for a quiescent state for the + * current grace period? If either is the case, just leave. + * However, this should not happen due to the preemptible + * sync_sched_exp_online_cleanup() implementation being a no-op, + * so warn if this does happen. + */ + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); + if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + READ_ONCE(rdp->cpu_no_qs.b.exp))) + return; + + /* + * Second, the common case of not being in an RCU read-side * critical section. If also enabled or idle, immediately * report the quiescent state, otherwise defer. */ if (!depth) { if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || - rcu_is_cpu_rrupt_from_idle()) { + rcu_is_cpu_rrupt_from_idle()) rcu_report_exp_rdp(rdp); - } else { - WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + else + rcu_exp_need_qs(); return; } /* - * Second, the less-common case of being in an RCU read-side + * Third, the less-common case of being in an RCU read-side * critical section. In this case we can count on a future * rcu_read_unlock(). However, this rcu_read_unlock() might * execute on some other CPU, but in that case there will be @@ -768,7 +797,7 @@ static void rcu_exp_handler(void *unused) return; } - // Finally, negative nesting depth should not happen. + // Fourth and finally, negative nesting depth should not happen. WARN_ON_ONCE(1); } @@ -835,16 +864,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) #else /* #ifdef CONFIG_PREEMPT_RCU */ -/* Request an expedited quiescent state. */ -static void rcu_exp_need_qs(void) -{ - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - /* Invoked on each online non-idle CPU for expedited quiescent state. 
*/ static void rcu_exp_handler(void *unused) { @@ -852,6 +871,7 @@ static void rcu_exp_handler(void *unused) struct rcu_node *rnp = rdp->mynode; bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3927ea5f7955..3600152b858e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -275,6 +275,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rcu_report_exp_rdp(rdp); else WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); + ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); } /* @@ -1217,16 +1218,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + rcu_thread_affine_rnp(t, rnp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return READ_ONCE(rnp->boost_kthread_task); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1243,10 +1241,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return NULL; -} #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f8436969e0c8..c912b594ba98 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -527,12 +527,12 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) /* Get rcutorture access to sched_setaffinity(). */ -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn) { int ret; ret = sched_setaffinity(pid, in_mask); - WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); + WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); return ret; } EXPORT_SYMBOL_GPL(torture_sched_setaffinity); diff --git a/kernel/rseq.c b/kernel/rseq.c index 9de6e35fe679..442aba29bc4c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -13,6 +13,7 @@ #include <linux/syscalls.h> #include <linux/rseq.h> #include <linux/types.h> +#include <linux/ratelimit.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS @@ -25,6 +26,78 @@ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) +#ifdef CONFIG_DEBUG_RSEQ +static struct rseq *rseq_kernel_fields(struct task_struct *t) +{ + return (struct rseq *) t->rseq_fields; +} + +static int rseq_validate_ro_fields(struct task_struct *t) +{ + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + u32 cpu_id_start, cpu_id, node_id, mm_cid; + struct rseq __user *rseq = t->rseq; + + /* + * Validate fields which are required to be read-only by + * user-space. 
+ */ + if (!user_read_access_begin(rseq, t->rseq_len)) + goto efault; + unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); + unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); + unsafe_get_user(node_id, &rseq->node_id, efault_end); + unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); + user_read_access_end(); + + if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || + cpu_id != rseq_kernel_fields(t)->cpu_id || + node_id != rseq_kernel_fields(t)->node_id || + mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { + + pr_warn("Detected rseq corruption for pid: %d, name: %s\n" + "\tcpu_id_start: %u ?= %u\n" + "\tcpu_id: %u ?= %u\n" + "\tnode_id: %u ?= %u\n" + "\tmm_cid: %u ?= %u\n", + t->pid, t->comm, + cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, + cpu_id, rseq_kernel_fields(t)->cpu_id, + node_id, rseq_kernel_fields(t)->node_id, + mm_cid, rseq_kernel_fields(t)->mm_cid); + } + + /* For now, only print a console warning on mismatch. */ + return 0; + +efault_end: + user_read_access_end(); +efault: + return -EFAULT; +} + +static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, + u32 node_id, u32 mm_cid) +{ + rseq_kernel_fields(t)->cpu_id_start = cpu_id; + rseq_kernel_fields(t)->cpu_id = cpu_id; + rseq_kernel_fields(t)->node_id = node_id; + rseq_kernel_fields(t)->mm_cid = mm_cid; +} +#else +static int rseq_validate_ro_fields(struct task_struct *t) +{ + return 0; +} + +static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id, + u32 node_id, u32 mm_cid) +{ +} +#endif + /* * * Restartable sequences are a lightweight interface that allows @@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t) u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); + /* + * Validate read-only rseq fields. + */ + if (rseq_validate_ro_fields(t)) + goto efault; WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; @@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t) * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); + rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid); trace_rseq_update(t); return 0; @@ -120,6 +199,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) mm_cid = 0; /* + * Validate read-only rseq fields. + */ + if (rseq_validate_ro_fields(t)) + return -EFAULT; + /* * Reset cpu_id_start to its initial state (0). */ if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) @@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) */ if (put_user(mm_cid, &t->rseq->mm_cid)) return -EFAULT; + + rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid); + /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if @@ -423,6 +510,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; +#ifdef CONFIG_DEBUG_RSEQ + /* + * Initialize the in-kernel rseq fields copy for validation of + * read-only fields. 
+ */ + if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || + get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || + get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || + get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) + return -EFAULT; +#endif /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3e5a6bf587f9..88a9a515b2ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -740,39 +740,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) s64 __maybe_unused steal = 0, irq_delta = 0; #ifdef CONFIG_IRQ_TIME_ACCOUNTING - irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + if (irqtime_enabled()) { + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; - /* - * Since irq_time is only updated on {soft,}irq_exit, we might run into - * this case when a previous update_rq_clock() happened inside a - * {soft,}IRQ region. - * - * When this happens, we stop ->clock_task and only update the - * prev_irq_time stamp to account for the part that fit, so that a next - * update will consume the rest. This ensures ->clock_task is - * monotonic. - * - * It does however cause some slight miss-attribution of {soft,}IRQ - * time, a more accurate solution would be to update the irq_time using - * the current rq->clock timestamp, except that would require using - * atomic ops. - */ - if (irq_delta > delta) - irq_delta = delta; + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}IRQ region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}IRQ + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. 
+ */ + if (irq_delta > delta) + irq_delta = delta; - rq->prev_irq_time += irq_delta; - delta -= irq_delta; - delayacct_irq(rq->curr, irq_delta); + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + delayacct_irq(rq->curr, irq_delta); + } #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((&paravirt_steal_rq_enabled))) { - steal = paravirt_steal_clock(cpu_of(rq)); + u64 prev_steal; + + steal = prev_steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - rq->prev_steal_time_rq += steal; + rq->prev_steal_time_rq = prev_steal; delta -= steal; } #endif @@ -1168,13 +1172,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); guard(rcu)(); @@ -1189,7 +1193,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); return default_cpu; } @@ -1341,7 +1345,7 @@ bool sched_can_stop_tick(struct rq *rq) if (scx_enabled() && !scx_can_stop_tick(rq)) return false; - if (rq->cfs.h_nr_running > 1) + if (rq->cfs.h_nr_queued > 1) return false; /* @@ -3534,7 +3538,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, task_cpu_possible_mask(p)); + do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -5632,7 +5636,7 @@ void sched_tick(void) unsigned long hw_pressure; u64 resched_latency; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) arch_scale_freq_tick(); sched_clock_tick(); @@ -5771,7 +5775,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -5792,7 +5796,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) return; WARN_ON_ONCE(!tick_work_cpu); @@ -6018,7 +6022,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * opportunity to pull in more work from other CPUs. */ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && - rq->nr_running == rq->cfs.h_nr_running)) { + rq->nr_running == rq->cfs.h_nr_queued)) { p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) @@ -6641,7 +6645,6 @@ static void __sched notrace __schedule(int sched_mode) * as a preemption by schedule_debug() and RCU. 
*/ bool preempt = sched_mode > SM_NONE; - bool block = false; unsigned long *switch_count; unsigned long prev_state; struct rq_flags rf; @@ -6702,7 +6705,7 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - block = try_to_block_task(rq, prev, prev_state); + try_to_block_task(rq, prev, prev_state); switch_count = &prev->nvcsw; } @@ -6748,7 +6751,8 @@ picked: migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); - psi_sched_switch(prev, next, block); + psi_sched_switch(prev, next, !task_on_rq_queued(prev) || + prev->se.sched_delayed); trace_sched_switch(preempt, prev, next, prev_state); @@ -8180,19 +8184,14 @@ static void cpuset_cpu_active(void) cpuset_update_active_cpus(); } -static int cpuset_cpu_inactive(unsigned int cpu) +static void cpuset_cpu_inactive(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - int ret = dl_bw_check_overflow(cpu); - - if (ret) - return ret; cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); } - return 0; } static inline void sched_smt_present_inc(int cpu) @@ -8254,6 +8253,11 @@ int sched_cpu_deactivate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); int ret; + ret = dl_bw_deactivate(cpu); + + if (ret) + return ret; + /* * Remove CPU from nohz.idle_cpus_mask to prevent participating in * load balancing when not active @@ -8299,15 +8303,7 @@ int sched_cpu_deactivate(unsigned int cpu) return 0; sched_update_numa(cpu, false); - ret = cpuset_cpu_inactive(cpu); - if (ret) { - sched_smt_present_inc(cpu); - sched_set_rq_online(rq, cpu); - balance_push_set(cpu, false); - set_cpu_active(cpu, true); - sched_update_numa(cpu, true); - return ret; - } + cpuset_cpu_inactive(cpu); sched_domains_numa_masks_clear(cpu); return 0; } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28c77904ea74..a2a29e3fffca 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -83,7 +83,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (unlikely(sg_policy->limits_changed)) { sg_policy->limits_changed = false; - sg_policy->need_freq_update = true; + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); return true; } @@ -96,7 +96,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { if (sg_policy->need_freq_update) - sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + sg_policy->need_freq_update = false; else if (sg_policy->next_freq == next_freq) return false; @@ -604,31 +604,6 @@ static const struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -#ifdef CONFIG_ENERGY_MODEL -static void rebuild_sd_workfn(struct work_struct *work) -{ - rebuild_sched_domains_energy(); -} - -static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); - -/* - * EAS shouldn't be attempted without sugov, so rebuild the sched_domains - * on governor changes to make sure the scheduler knows about it. - */ -static void sugov_eas_rebuild_sd(void) -{ - /* - * When called from the cpufreq_register_driver() path, the - * cpu_hotplug_lock is already held, so use a work item to - * avoid nested locking in rebuild_sched_domains(). 
- */ - schedule_work(&rebuild_sd_work); -} -#else -static inline void sugov_eas_rebuild_sd(void) { }; -#endif - struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) @@ -784,7 +759,11 @@ static int sugov_init(struct cpufreq_policy *policy) goto fail; out: - sugov_eas_rebuild_sd(); + /* + * Schedutil is the preferred governor for EAS, so rebuild sched domains + * on governor changes to make sure the scheduler knows about them. + */ + em_rebuild_sched_domains(); mutex_unlock(&global_tunables_lock); return 0; @@ -826,7 +805,7 @@ static void sugov_exit(struct cpufreq_policy *policy) sugov_policy_free(sg_policy); cpufreq_disable_fast_switch(policy); - sugov_eas_rebuild_sd(); + em_rebuild_sched_domains(); } static int sugov_start(struct cpufreq_policy *policy) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0bed0fa1acd9..5d9143dd0879 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -9,6 +9,8 @@ #ifdef CONFIG_IRQ_TIME_ACCOUNTING +DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime); + /* * There are no locks covering percpu hardirq/softirq time. * They are only modified in vtime_account, on corresponding CPU @@ -22,16 +24,14 @@ */ DEFINE_PER_CPU(struct irqtime, cpu_irqtime); -static int sched_clock_irqtime; - void enable_sched_clock_irqtime(void) { - sched_clock_irqtime = 1; + static_branch_enable(&sched_clock_irqtime); } void disable_sched_clock_irqtime(void) { - sched_clock_irqtime = 0; + static_branch_disable(&sched_clock_irqtime); } static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, @@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset) s64 delta; int cpu; - if (!sched_clock_irqtime) + if (!irqtime_enabled()) return; cpu = smp_processor_id(); @@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime) #else /* CONFIG_IRQ_TIME_ACCOUNTING */ -#define sched_clock_irqtime (0) - static u64 irqtime_tick_accounted(u64 dummy) { return 0; @@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (vtime_accounting_enabled_this_cpu()) return; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_process_tick(p, user_tick, 1); return; } @@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks) { u64 cputime, steal; - if (sched_clock_irqtime) { + if (irqtime_enabled()) { irqtime_account_idle_ticks(ticks); return; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d94f2ed6d1f4..62192ac79c30 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s __add_rq_bw(new_bw, &rq->dl); } +static __always_inline +void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer) +{ + /* + * If the timer callback was running (hrtimer_try_to_cancel == -1), + * it will eventually call put_task_struct(). 
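The cancel_dl_timer() helper introduced in the kernel/sched/deadline.c hunk here is built around the return convention of hrtimer_try_to_cancel(): 1 means a queued timer was removed before firing, 0 means the timer was not queued, and -1 means its callback is currently running, so only the first case still owns the task reference taken when the timer was armed. A minimal sketch of that reference-handling pattern, with hypothetical names (my_ctx, cancel_my_timer) rather than code from this patch:

#include <linux/hrtimer.h>
#include <linux/sched/task.h>

struct my_ctx {
	struct hrtimer timer;		/* armed with a task reference held */
	struct task_struct *task;
};

static void cancel_my_timer(struct my_ctx *ctx)
{
	switch (hrtimer_try_to_cancel(&ctx->timer)) {
	case 1:		/* timer was queued and has now been cancelled */
		put_task_struct(ctx->task);	/* drop the arm-time reference */
		break;
	case 0:		/* timer already fired (or was never armed) */
	case -1:	/* callback is running and will drop the reference itself */
		break;
	}
}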
+ */ + if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); +} + +static __always_inline +void cancel_replenish_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->dl_timer); +} + +static __always_inline +void cancel_inactive_timer(struct sched_dl_entity *dl_se) +{ + cancel_dl_timer(dl_se, &dl_se->inactive_timer); +} + static void dl_change_utilization(struct task_struct *p, u64 new_bw) { WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); @@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { - if (!dl_server(dl_se)) - put_task_struct(dl_task_of(dl_se)); - } + cancel_inactive_timer(dl_se); } else { /* * Since "dl_non_contending" is not set, the @@ -2115,13 +2135,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * The replenish timer needs to be canceled. No * problem if it fires concurrently: boosted threads * are ignored in dl_task_timer(). - * - * If the timer callback was running (hrtimer_try_to_cancel == -1), - * it will eventually call put_task_struct(). */ - if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 && - !dl_server(&p->dl)) - put_task_struct(p); + cancel_replenish_timer(&p->dl); p->dl.dl_throttled = 0; } } else if (!dl_prio(p->normal_prio)) { @@ -2289,8 +2304,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); } sub_rq_bw(&p->dl, &rq->dl); rq_unlock(rq, &rf); @@ -2506,16 +2520,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu return NULL; next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); - -next_node: - if (next_node) { + while (next_node) { p = __node_2_pdl(next_node); if (task_is_pushable(rq, p, cpu)) return p; next_node = rb_next(next_node); - goto next_node; } return NULL; @@ -2964,11 +2975,22 @@ void dl_add_task_root_domain(struct task_struct *p) void dl_clear_root_domain(struct root_domain *rd) { - unsigned long flags; + int i; - raw_spin_lock_irqsave(&rd->dl_bw.lock, flags); + guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); rd->dl_bw.total_bw = 0; - raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags); + + /* + * dl_server bandwidth is only restored when CPUs are attached to root + * domains (after domains are created or CPUs moved back to the + * default root doamin). 
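The dl_clear_root_domain() change in this hunk also replaces an explicit raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pair with scope-based locking via guard() from <linux/cleanup.h>, which releases the lock automatically when the enclosing scope is left. A minimal sketch of that pattern, with hypothetical names (my_counter, read_my_counter):

#include <linux/cleanup.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_counter {
	raw_spinlock_t lock;
	u64 value;
};

static u64 read_my_counter(struct my_counter *c)
{
	/* IRQ-safe lock held from here until the function returns */
	guard(raw_spinlock_irqsave)(&c->lock);
	return c->value;
}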
+ */ + for_each_cpu(i, rd->span) { + struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server; + + if (dl_server(dl_se) && cpu_active(i)) + rd->dl_bw.total_bw += dl_se->dl_bw; + } } #endif /* CONFIG_SMP */ @@ -3029,8 +3051,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + cancel_inactive_timer(&p->dl); /* * In case a task is setscheduled to SCHED_DEADLINE we need to keep @@ -3453,29 +3474,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, } enum dl_bw_request { - dl_bw_req_check_overflow = 0, + dl_bw_req_deactivate = 0, dl_bw_req_alloc, dl_bw_req_free }; static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow = 0; + u64 fair_server_bw = 0; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (req == dl_bw_req_free) { + cap = dl_bw_capacity(cpu); + switch (req) { + case dl_bw_req_free: __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu)); - } else { - unsigned long cap = dl_bw_capacity(cpu); - + break; + case dl_bw_req_alloc: overflow = __dl_overflow(dl_b, cap, 0, dl_bw); - if (req == dl_bw_req_alloc && !overflow) { + if (!overflow) { /* * We reserve space in the destination * root_domain, as we can't fail after this point. @@ -3484,6 +3507,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) */ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu)); } + break; + case dl_bw_req_deactivate: + /* + * cpu is not off yet, but we need to do the math by + * considering it off already (i.e., what would happen if we + * turn cpu off?). + */ + cap -= arch_scale_cpu_capacity(cpu); + + /* + * cpu is going offline and NORMAL tasks will be moved away + * from it. We can thus discount dl_server bandwidth + * contribution as it won't need to be servicing tasks after + * the cpu is off. + */ + if (cpu_rq(cpu)->fair_server.dl_server) + fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw; + + /* + * Not much to check if no DEADLINE bandwidth is present. + * dl_servers we can discount, as tasks will be moved out the + * offlined CPUs anyway. + */ + if (dl_b->total_bw - fair_server_bw > 0) { + /* + * Leaving at least one CPU for DEADLINE tasks seems a + * wise thing to do. As said above, cpu is not offline + * yet, so account for that. + */ + if (dl_bw_cpus(cpu) - 1) + overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0); + else + overflow = 1; + } + + break; } raw_spin_unlock_irqrestore(&dl_b->lock, flags); @@ -3492,9 +3551,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw) return overflow ? 
-EBUSY : 0; } -int dl_bw_check_overflow(int cpu) +int dl_bw_deactivate(int cpu) { - return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0); + return dl_bw_manage(dl_bw_req_deactivate, cpu, 0); } int dl_bw_alloc(int cpu, u64 dl_bw) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a1be00a988bf..fd7e85220715 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - if (rq->cfs.h_nr_running) { + if (rq->cfs.h_nr_queued) { update_rq_clock(rq); dl_server_stop(&rq->fair_server); } @@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", cpu_of(rq)); - if (rq->cfs.h_nr_running) + if (rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); } @@ -843,13 +843,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); - SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", - cfs_rq->idle_nr_running); - SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", - cfs_rq->idle_h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", @@ -1295,8 +1292,10 @@ void resched_latency_warn(int cpu, u64 latency) { static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1); - WARN(__ratelimit(&latency_check_ratelimit), - "sched: CPU %d need_resched set for > %llu ns (%d ticks) " - "without schedule\n", - cpu, latency, cpu_rq(cpu)->ticks_without_resched); + if (likely(!__ratelimit(&latency_check_ratelimit))) + return; + + pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n", + cpu, latency, cpu_rq(cpu)->ticks_without_resched); + dump_stack(); } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 19813b387ef9..7fee43426ee7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5399,7 +5399,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) { struct kthread_worker *helper; - helper = kthread_create_worker(0, name); + helper = kthread_run_worker(0, name); if (helper) sched_set_fifo(helper->task); return helper; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e9ca38512de..1e78caa21436 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -37,6 +37,7 @@ #include <linux/sched/cputime.h> #include <linux/sched/isolation.h> #include <linux/sched/nohz.h> +#include <linux/sched/prio.h> #include <linux/cpuidle.h> #include <linux/interrupt.h> @@ -51,6 +52,8 @@ #include <asm/switch_to.h> +#include <uapi/linux/sched/types.h> + #include "sched.h" #include "stats.h" #include "autogroup.h" @@ -523,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); * Scheduling class tree data structure manipulation methods: */ -static inline u64 
max_vruntime(u64 max_vruntime, u64 vruntime) +static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); if (delta > 0) @@ -532,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) return max_vruntime; } -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - min_vruntime); if (delta < 0) @@ -689,21 +692,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * * XXX could add max_slice to the augmented data to track this. */ -static s64 entity_lag(u64 avruntime, struct sched_entity *se) +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 vlag, limit; - vlag = avruntime - se->vruntime; - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); - - return clamp(vlag, -limit, limit); -} - -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ SCHED_WARN_ON(!se->on_rq); - se->vlag = entity_lag(avg_vruntime(cfs_rq), se); + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + + se->vlag = clamp(vlag, -limit, limit); } /* @@ -915,7 +913,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) * We can safely skip eligibility check if there is only one entity * in this cfs_rq, saving some cycles. */ - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) @@ -1250,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq) account_cfs_rq_runtime(cfs_rq, delta_exec); - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_queued == 1) return; if (resched || did_preempt_short(cfs_rq, curr)) { @@ -2131,7 +2129,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); ns->util += cpu_util_cfs(cpu); - ns->nr_running += rq->cfs.h_nr_running; + ns->nr_running += rq->cfs.h_nr_runnable; ns->compute_capacity += capacity_of(cpu); if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { @@ -3682,9 +3680,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &rq->cfs_tasks); } #endif - cfs_rq->nr_running++; - if (se_is_idle(se)) - cfs_rq->idle_nr_running++; + cfs_rq->nr_queued++; } static void @@ -3697,9 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } #endif - cfs_rq->nr_running--; - if (se_is_idle(se)) - cfs_rq->idle_nr_running--; + cfs_rq->nr_queued--; } /* @@ -3774,137 +3768,32 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif -static void reweight_eevdf(struct sched_entity *se, u64 avruntime, - unsigned long weight) -{ - unsigned long old_weight = se->load.weight; - s64 vlag, vslice; - - /* - * VRUNTIME - * -------- - * - * COROLLARY #1: The virtual runtime of the entity needs to be - * adjusted if re-weight at !0-lag point. - * - * Proof: For contradiction assume this is not true, so we can - * re-weight without changing vruntime at !0-lag point. 
- * - * Weight VRuntime Avg-VRuntime - * before w v V - * after w' v' V' - * - * Since lag needs to be preserved through re-weight: - * - * lag = (V - v)*w = (V'- v')*w', where v = v' - * ==> V' = (V - v)*w/w' + v (1) - * - * Let W be the total weight of the entities before reweight, - * since V' is the new weighted average of entities: - * - * V' = (WV + w'v - wv) / (W + w' - w) (2) - * - * by using (1) & (2) we obtain: - * - * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v - * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v - * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v - * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) - * - * Since we are doing at !0-lag point which means V != v, we - * can simplify (3): - * - * ==> W / (W + w' - w) = w / w' - * ==> Ww' = Ww + ww' - ww - * ==> W * (w' - w) = w * (w' - w) - * ==> W = w (re-weight indicates w' != w) - * - * So the cfs_rq contains only one entity, hence vruntime of - * the entity @v should always equal to the cfs_rq's weighted - * average vruntime @V, which means we will always re-weight - * at 0-lag point, thus breach assumption. Proof completed. - * - * - * COROLLARY #2: Re-weight does NOT affect weighted average - * vruntime of all the entities. - * - * Proof: According to corollary #1, Eq. (1) should be: - * - * (V - v)*w = (V' - v')*w' - * ==> v' = V' - (V - v)*w/w' (4) - * - * According to the weighted average formula, we have: - * - * V' = (WV - wv + w'v') / (W - w + w') - * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') - * = (WV - wv + w'V' - Vw + wv) / (W - w + w') - * = (WV + w'V' - Vw) / (W - w + w') - * - * ==> V'*(W - w + w') = WV + w'V' - Vw - * ==> V' * (W - w) = (W - w) * V (5) - * - * If the entity is the only one in the cfs_rq, then reweight - * always occurs at 0-lag point, so V won't change. Or else - * there are other entities, hence W != w, then Eq. (5) turns - * into V' = V. So V won't change in either case, proof done. - * - * - * So according to corollary #1 & #2, the effect of re-weight - * on vruntime should be: - * - * v' = V' - (V - v) * w / w' (4) - * = V - (V - v) * w / w' - * = V - vl * w / w' - * = V - vl' - */ - if (avruntime != se->vruntime) { - vlag = entity_lag(avruntime, se); - vlag = div_s64(vlag * old_weight, weight); - se->vruntime = avruntime - vlag; - } - - /* - * DEADLINE - * -------- - * - * When the weight changes, the virtual time slope changes and - * we should adjust the relative virtual deadline accordingly. - * - * d' = v' + (d - v)*w/w' - * = V' - (V - v)*w/w' + (d - v)*w/w' - * = V - (V - v)*w/w' + (d - v)*w/w' - * = V + (d - V)*w/w' - */ - vslice = (s64)(se->deadline - avruntime); - vslice = div_s64(vslice * old_weight, weight); - se->deadline = avruntime + vslice; -} +static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; - u64 avruntime; if (se->on_rq) { /* commit outstanding execution time */ update_curr(cfs_rq); - avruntime = avg_vruntime(cfs_rq); + update_entity_lag(cfs_rq, se); + se->deadline -= se->vruntime; + se->rel_deadline = 1; if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - if (se->on_rq) { - reweight_eevdf(se, avruntime, weight); - } else { - /* - * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), - * we need to scale se->vlag when w_i changes. 
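As the comment here notes, se->vlag stores (V - v_i) while the quantity to preserve across a reweight is the weighted lag lag_i = w_i*(V - v_i), hence the old_weight/new_weight rescale. A small stand-alone check of that arithmetic, with illustrative values that are not taken from this patch:

#include <assert.h>

int main(void)
{
	long long V_minus_v = 3;		/* stored se->vlag = V - v_i */
	long long w_old = 2, w_new = 6;		/* entity weight before/after reweight */

	long long lag_old  = w_old * V_minus_v;			/* lag_i = w_i * (V - v_i) */
	long long vlag_new = V_minus_v * w_old / w_new;		/* the div_s64() rescale */
	long long lag_new  = w_new * vlag_new;

	assert(lag_old == lag_new);		/* weighted lag preserved: 6 == 6 */
	return 0;
}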
- */ - se->vlag = div_s64(se->vlag * se->load.weight, weight); - } + /* + * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), + * we need to scale se->vlag when w_i changes. + */ + se->vlag = div_s64(se->vlag * se->load.weight, weight); + if (se->rel_deadline) + se->deadline = div_s64(se->deadline * se->load.weight, weight); update_load_set(&se->load, weight); @@ -3919,6 +3808,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); + place_entity(cfs_rq, se, 0); if (!curr) __enqueue_entity(cfs_rq, se); @@ -4065,7 +3955,11 @@ static void update_cfs_group(struct sched_entity *se) struct cfs_rq *gcfs_rq = group_cfs_rq(se); long shares; - if (!gcfs_rq) + /* + * When a group becomes empty, preserve its weight. This matters for + * DELAY_DEQUEUE. + */ + if (!gcfs_rq || !gcfs_rq->load.weight) return; if (throttled_hierarchy(gcfs_rq)) @@ -5233,7 +5127,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return !cfs_rq->nr_running; + return !cfs_rq->nr_queued; } #define UPDATE_TG 0x0 @@ -5271,6 +5165,22 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ +void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) +{ + struct sched_entity *se = &p->se; + + p->static_prio = NICE_TO_PRIO(attr->sched_nice); + if (attr->sched_runtime) { + se->custom_slice = 1; + se->slice = clamp_t(u64, attr->sched_runtime, + NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ + NSEC_PER_MSEC*100); /* HZ=100 / 10 */ + } else { + se->custom_slice = 0; + se->slice = sysctl_sched_base_slice; + } +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -5289,7 +5199,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { + if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; @@ -5359,7 +5269,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; - if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { + if (se->rel_deadline) { se->deadline += se->vruntime; se->rel_deadline = 0; return; @@ -5382,8 +5292,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); -static inline bool cfs_bandwidth_used(void); - static void requeue_delayed_entity(struct sched_entity *se); @@ -5405,7 +5313,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. 
* - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -5438,7 +5346,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + if (cfs_rq->nr_queued == 1) { check_enqueue_throttle(cfs_rq); if (!throttled_hierarchy(cfs_rq)) { list_add_leaf_cfs_rq(cfs_rq); @@ -5480,7 +5388,7 @@ static void set_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_delayed++; + cfs_rq->h_nr_runnable--; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5492,7 +5400,7 @@ static void clear_delayed(struct sched_entity *se) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - cfs_rq->h_nr_delayed--; + cfs_rq->h_nr_runnable++; if (cfs_rq_throttled(cfs_rq)) break; } @@ -5509,6 +5417,7 @@ static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { bool sleep = flags & DEQUEUE_SLEEP; + int action = UPDATE_TG; update_curr(cfs_rq); clear_buddies(cfs_rq, se); @@ -5534,7 +5443,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } - int action = UPDATE_TG; if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) action |= DO_DETACH; @@ -5542,7 +5450,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new - * h_nr_running of its group cfs_rq. + * h_nr_runnable of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. @@ -5580,7 +5488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); return true; @@ -5642,17 +5550,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); static struct sched_entity * pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { + struct sched_entity *se; + /* - * Enabling NEXT_BUDDY will affect latency but not fairness. + * Picking the ->next buddy will affect latency but not fairness. 
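Earlier in this file the new __setparam_fair() helper maps sched_attr::sched_runtime onto a per-task slice, clamped to the 0.1 ms to 100 ms range. From user space that path is reached through sched_setattr(); the sketch below is a hedged example that assumes a kernel which accepts a custom slice for SCHED_OTHER tasks, and it defines struct sched_attr locally because glibc provides no wrapper:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local copy of the uapi layout; glibc has no sched_setattr() wrapper. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* for fair tasks: requested slice, in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;
	uint32_t sched_util_max;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 0;			/* SCHED_OTHER */
	attr.sched_runtime = 2 * 1000 * 1000;	/* ask for a 2 ms slice */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");	/* e.g. EINVAL on kernels without custom slices */
	return 0;
}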
*/ - if (sched_feat(NEXT_BUDDY) && + if (sched_feat(PICK_BUDDY) && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ SCHED_WARN_ON(cfs_rq->next->sched_delayed); return cfs_rq->next; } - struct sched_entity *se = pick_eevdf(cfs_rq); + se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); /* @@ -5928,7 +5838,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) list_del_leaf_cfs_rq(cfs_rq); SCHED_WARN_ON(cfs_rq->throttled_clock_self); - if (cfs_rq->nr_running) + if (cfs_rq->nr_queued) cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -5941,8 +5851,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta, dequeue = 1; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_delta, dequeue = 1; + long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5972,9 +5882,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock(); - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); int flags; @@ -5994,11 +5904,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, flags); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -6017,18 +5927,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running -= task_delta; - qcfs_rq->idle_h_nr_running -= idle_task_delta; - qcfs_rq->h_nr_delayed -= delayed_delta; + qcfs_rq->h_nr_queued -= queued_delta; + qcfs_rq->h_nr_runnable -= runnable_delta; + qcfs_rq->h_nr_idle -= idle_delta; } /* At this point se is NULL and we are at root level*/ - sub_nr_running(rq, task_delta); + sub_nr_running(rq, queued_delta); /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); done: /* @@ -6037,7 +5947,7 @@ done: */ cfs_rq->throttled = 1; SCHED_WARN_ON(cfs_rq->throttled_clock); - if (cfs_rq->nr_running) + if (cfs_rq->nr_queued) cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -6047,8 +5957,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, delayed_delta; - long rq_h_nr_running = rq->cfs.h_nr_running; + long queued_delta, runnable_delta, idle_delta; + long rq_h_nr_queued = 
rq->cfs.h_nr_queued; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6081,9 +5991,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } - task_delta = cfs_rq->h_nr_running; - idle_task_delta = cfs_rq->idle_h_nr_running; - delayed_delta = cfs_rq->h_nr_delayed; + queued_delta = cfs_rq->h_nr_queued; + runnable_delta = cfs_rq->h_nr_runnable; + idle_delta = cfs_rq->h_nr_idle; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -6097,11 +6007,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6115,11 +6025,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) se_update_runnable(se); if (cfs_rq_is_idle(group_cfs_rq(se))) - idle_task_delta = cfs_rq->h_nr_running; + idle_delta = cfs_rq->h_nr_queued; - qcfs_rq->h_nr_running += task_delta; - qcfs_rq->idle_h_nr_running += idle_task_delta; - qcfs_rq->h_nr_delayed += delayed_delta; + qcfs_rq->h_nr_queued += queued_delta; + qcfs_rq->h_nr_runnable += runnable_delta; + qcfs_rq->h_nr_idle += idle_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6127,17 +6037,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } /* Start the fair server if un-throttling resulted in new runnable tasks */ - if (!rq_h_nr_running && rq->cfs.h_nr_running) + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); /* At this point se is NULL and we are at root level*/ - add_nr_running(rq, task_delta); + add_nr_running(rq, queued_delta); unthrottle_throttle: assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ - if (rq->curr == rq->idle && rq->cfs.nr_running) + if (rq->curr == rq->idle && rq->cfs.nr_queued) resched_curr(rq); } @@ -6438,7 +6348,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) + if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued) return; __return_cfs_rq_runtime(cfs_rq); @@ -6709,6 +6619,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) lockdep_assert_rq_held(rq); + // Do not unthrottle for an active CPU + if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask)) + return; + /* * The rq clock has already been updated in the * set_rq_offline(), so we should skip updating @@ -6724,18 +6638,20 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) continue; /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = 1; - /* * Offline rq is schedulable till CPU is completely disabled * in take_cpu_down(), so we prevent new cfs throttling here. 
*/ cfs_rq->runtime_enabled = 0; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); + if (!cfs_rq_throttled(cfs_rq)) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = 1; + unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); @@ -6784,11 +6700,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) #else /* CONFIG_CFS_BANDWIDTH */ -static inline bool cfs_bandwidth_used(void) -{ - return false; -} - static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} @@ -6846,7 +6757,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) SCHED_WARN_ON(task_rq(p) != rq); - if (rq->cfs.h_nr_running > 1) { + if (rq->cfs.h_nr_queued > 1) { u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; u64 slice = se->slice; s64 delta = slice - ran; @@ -6934,7 +6845,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { } /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { - return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running && + return unlikely(rq->nr_running == rq->cfs.h_nr_idle && rq->nr_running); } @@ -6961,14 +6872,14 @@ requeue_delayed_entity(struct sched_entity *se) if (sched_feat(DELAY_ZERO)) { update_entity_lag(cfs_rq, se); if (se->vlag > 0) { - cfs_rq->nr_running--; + cfs_rq->nr_queued--; if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->vlag = 0; place_entity(cfs_rq, se, 0); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - cfs_rq->nr_running++; + cfs_rq->nr_queued++; } } @@ -6986,10 +6897,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int idle_h_nr_running = task_has_idle_policy(p); - int h_nr_delayed = 0; + int h_nr_idle = task_has_idle_policy(p); + int h_nr_runnable = 1; int task_new = !(flags & ENQUEUE_WAKEUP); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; u64 slice = 0; /* @@ -7014,8 +6925,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); - if (task_new) - h_nr_delayed = !!se->sched_delayed; + if (task_new && se->sched_delayed) + h_nr_runnable = 0; for_each_sched_entity(se) { if (se->on_rq) { @@ -7037,12 +6948,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) enqueue_entity(cfs_rq, se, flags); slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7061,19 +6972,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running++; - cfs_rq->idle_h_nr_running += idle_h_nr_running; - cfs_rq->h_nr_delayed += h_nr_delayed; + cfs_rq->h_nr_runnable += h_nr_runnable; + cfs_rq->h_nr_queued++; + cfs_rq->h_nr_idle += h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = 1; + h_nr_idle = 1; /* end evaluation on encountering a throttled cfs_rq */ if 
(cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; } - if (!rq_h_nr_running && rq->cfs.h_nr_running) { + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { /* Account for idle runtime */ if (!rq->nr_running) dl_server_update_idle_time(rq, rq->curr); @@ -7120,22 +7031,22 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_running = rq->cfs.h_nr_running; + int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; - int idle_h_nr_running = 0; - int h_nr_running = 0; - int h_nr_delayed = 0; + int h_nr_idle = 0; + int h_nr_queued = 0; + int h_nr_runnable = 0; struct cfs_rq *cfs_rq; u64 slice = 0; if (entity_is_task(se)) { p = task_of(se); - h_nr_running = 1; - idle_h_nr_running = task_has_idle_policy(p); - if (!task_sleep && !task_delayed) - h_nr_delayed = !!se->sched_delayed; + h_nr_queued = 1; + h_nr_idle = task_has_idle_policy(p); + if (task_sleep || task_delayed || !se->sched_delayed) + h_nr_runnable = 1; } else { cfs_rq = group_cfs_rq(se); slice = cfs_rq_min_slice(cfs_rq); @@ -7151,12 +7062,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) break; } - cfs_rq->h_nr_running -= h_nr_running; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) @@ -7190,21 +7101,21 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) se->slice = slice; slice = cfs_rq_min_slice(cfs_rq); - cfs_rq->h_nr_running -= h_nr_running; - cfs_rq->idle_h_nr_running -= idle_h_nr_running; - cfs_rq->h_nr_delayed -= h_nr_delayed; + cfs_rq->h_nr_runnable -= h_nr_runnable; + cfs_rq->h_nr_queued -= h_nr_queued; + cfs_rq->h_nr_idle -= h_nr_idle; if (cfs_rq_is_idle(cfs_rq)) - idle_h_nr_running = h_nr_running; + h_nr_idle = h_nr_queued; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) return 0; } - sub_nr_running(rq, h_nr_running); + sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_running && !rq->cfs.h_nr_running) + if (rq_h_nr_queued && !rq->cfs.h_nr_queued) dl_server_stop(&rq->fair_server); /* balance early to pull high priority tasks */ @@ -8893,7 +8804,7 @@ static struct task_struct *pick_task_fair(struct rq *rq) again: cfs_rq = &rq->cfs; - if (!cfs_rq->nr_running) + if (!cfs_rq->nr_queued) return NULL; do { @@ -9010,7 +8921,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) { - return !!dl_se->rq->cfs.nr_running; + return !!dl_se->rq->cfs.nr_queued; } static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) @@ -9341,43 +9252,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env) #ifdef CONFIG_NUMA_BALANCING /* - * Returns 1, if task migration degrades locality - * Returns 0, if task migration improves locality i.e migration preferred. - * Returns -1, if task migration is not affected by locality. + * Returns a positive value, if task migration degrades locality. + * Returns 0, if task migration is not affected by locality. 
+ * Returns a negative value, if task migration improves locality i.e migration preferred. */ -static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { struct numa_group *numa_group = rcu_dereference(p->numa_group); unsigned long src_weight, dst_weight; int src_nid, dst_nid, dist; if (!static_branch_likely(&sched_numa_balancing)) - return -1; + return 0; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) - return -1; + return 0; src_nid = cpu_to_node(env->src_cpu); dst_nid = cpu_to_node(env->dst_cpu); if (src_nid == dst_nid) - return -1; + return 0; /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) { if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) return 1; else - return -1; + return 0; } /* Encourage migration to the preferred node. */ if (dst_nid == p->numa_preferred_nid) - return 0; + return -1; /* Leaving a core idle is often worse than degrading locality. */ if (env->idle == CPU_IDLE) - return -1; + return 0; dist = node_distance(src_nid, dst_nid); if (numa_group) { @@ -9388,37 +9299,77 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) dst_weight = task_weight(p, dst_nid, dist); } - return dst_weight < src_weight; + return src_weight - dst_weight; } #else -static inline int migrate_degrades_locality(struct task_struct *p, +static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { - return -1; + return 0; } #endif /* + * Check whether the task is ineligible on the destination cpu + * + * When the PLACE_LAG scheduling feature is enabled and + * dst_cfs_rq->nr_queued is greater than 1, if the task + * is ineligible, it will also be ineligible when + * it is migrated to the destination cpu. + */ +static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu) +{ + struct cfs_rq *dst_cfs_rq; + +#ifdef CONFIG_FAIR_GROUP_SCHED + dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; +#else + dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; +#endif + if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued && + !entity_eligible(task_cfs_rq(p), &p->se)) + return 1; + + return 0; +} + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { - int tsk_cache_hot; + long degrades, hot; lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) + p->sched_task_hot = 0; /* * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. + * 1) delayed dequeued unless we migrate load, or + * 2) throttled_lb_pair, or + * 3) cannot be migrated to this CPU due to cpus_ptr, or + * 4) running (obviously), or + * 5) are cache-hot on their current CPU. */ + if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) + return 0; + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; + /* + * We want to prioritize the migration of eligible tasks. + * For ineligible tasks we soft-limit them and only allow + * them to migrate when nr_balance_failed is non-zero to + * avoid load-balancing trying very hard to balance the load. + */ + if (!env->sd->nr_balance_failed && + task_is_ineligible_on_dst_cpu(p, env->dst_cpu)) + return 0; + /* Disregard percpu kthreads; they are where they need to be. 
*/ if (kthread_is_per_cpu(p)) return 0; @@ -9474,16 +9425,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (env->flags & LBF_ACTIVE_LB) return 1; - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); + else + hot = degrades > 0; - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->stats.nr_forced_migrations); - } + if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { + if (hot) + p->sched_task_hot = 1; return 1; } @@ -9498,6 +9448,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_rq_held(env->src_rq); + if (p->sched_task_hot) { + p->sched_task_hot = 0; + schedstat_inc(env->sd->lb_hot_gained[env->idle]); + schedstat_inc(p->stats.nr_forced_migrations); + } + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -9658,6 +9614,9 @@ static int detach_tasks(struct lb_env *env) continue; next: + if (p->sched_task_hot) + schedstat_inc(p->stats.nr_failed_migrations_hot); + list_move(&p->se.group_node, tasks); } @@ -9800,7 +9759,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq); - if (cfs_rq->nr_running == 0) + if (cfs_rq->nr_queued == 0) update_idle_cfs_rq_clock_pelt(cfs_rq); if (cfs_rq == &rq->cfs) @@ -10332,7 +10291,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * When there is more than 1 task, the group_overloaded case already * takes care of cpu with reduced capacity */ - if (rq->cfs.h_nr_running != 1) + if (rq->cfs.h_nr_runnable != 1) return false; return check_cpu_capacity(rq, sd); @@ -10354,7 +10313,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *sg_overloaded, bool *sg_overutilized) { - int i, nr_running, local_group; + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; memset(sgs, 0, sizeof(*sgs)); @@ -10367,21 +10327,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_load += load; sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); - sgs->sum_h_nr_running += rq->cfs.h_nr_running; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable; nr_running = rq->nr_running; sgs->sum_nr_running += nr_running; - if (nr_running > 1) - *sg_overloaded = 1; - if (cpu_overutilized(i)) *sg_overutilized = 1; -#ifdef CONFIG_NUMA_BALANCING - sgs->nr_numa_running += rq->nr_numa_running; - sgs->nr_preferred_running += rq->nr_preferred_running; -#endif /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -10391,10 +10344,21 @@ static inline void update_sg_lb_stats(struct lb_env *env, continue; } + /* Overload indicator is only updated at root domain */ + if (balancing_at_rd && nr_running > 1) + *sg_overloaded = 1; + +#ifdef CONFIG_NUMA_BALANCING + /* Only fbq_classify_group() uses this to classify NUMA groups */ + if (sd_flags & SD_NUMA) { + sgs->nr_numa_running += rq->nr_numa_running; + sgs->nr_preferred_running += rq->nr_preferred_running; + } +#endif if (local_group) continue; - if (env->sd->flags & SD_ASYM_CPUCAPACITY) { + if (sd_flags & SD_ASYM_CPUCAPACITY) { /* Check for a misfit task on the cpu */ if (sgs->group_misfit_task_load < rq->misfit_task_load) { 
sgs->group_misfit_task_load = rq->misfit_task_load; @@ -10682,7 +10646,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, sgs->group_util += cpu_util_without(i, p); sgs->group_runnable += cpu_runnable_without(rq, p); local = task_running_on_cpu(i, p); - sgs->sum_h_nr_running += rq->cfs.h_nr_running - local; + sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local; nr_running = rq->nr_running - local; sgs->sum_nr_running += nr_running; @@ -11464,7 +11428,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, if (rt > env->fbq_type) continue; - nr_running = rq->cfs.h_nr_running; + nr_running = rq->cfs.h_nr_runnable; if (!nr_running) continue; @@ -11623,7 +11587,7 @@ static int need_active_balance(struct lb_env *env) * available on dst_cpu. */ if (env->idle && - (env->src_rq->cfs.h_nr_running == 1)) { + (env->src_rq->cfs.h_nr_runnable == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) return 1; @@ -11703,6 +11667,28 @@ static int should_we_balance(struct lb_env *env) return group_balance_cpu(sg) == env->dst_cpu; } +static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd, + enum cpu_idle_type idle) +{ + if (!schedstat_enabled()) + return; + + switch (env->migration_type) { + case migrate_load: + __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance); + break; + case migrate_util: + __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance); + break; + case migrate_task: + __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance); + break; + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + } +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -11753,7 +11739,7 @@ redo: WARN_ON_ONCE(busiest == env.dst_rq); - schedstat_add(sd->lb_imbalance[idle], env.imbalance); + update_lb_imbalance_stat(&env, sd, idle); env.src_cpu = busiest->cpu; env.src_rq = busiest; @@ -12251,16 +12237,13 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set - * anywhere yet. */ static inline int find_new_ilb(void) { const struct cpumask *hk_mask; int ilb_cpu; - hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { @@ -12278,7 +12261,8 @@ static inline int find_new_ilb(void) * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU * SMP function call (IPI). * - * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set + * (if there is one). 
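The idle-load-balancer selection here, like the tick and timer paths earlier in this section, now keys off the consolidated HK_TYPE_KERNEL_NOISE housekeeping set. That set is populated from the boot command line; as the kernel/sched/isolation.c hunk further down notes, isolcpus=nohz is equivalent to nohz_full. Illustrative boot parameters (the CPU ranges are arbitrary examples, and nohz_full requires CONFIG_NO_HZ_FULL):

	nohz_full=2-7			keep the tick, unbound timers, RCU offload and other
					kernel noise off CPUs 2-7; CPUs 0-1 stay housekeeping CPUs
	isolcpus=nohz,domain,2-7	the same nohz isolation, plus removal of CPUs 2-7
					from the scheduler domains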
*/ static void kick_ilb(unsigned int flags) { @@ -12366,7 +12350,7 @@ static void nohz_balancer_kick(struct rq *rq) * If there's a runnable CFS task and the current CPU has reduced * capacity, kick the ILB to see if there's a better CPU to run on: */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { + if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -12498,10 +12482,6 @@ void nohz_balance_enter_idle(int cpu) if (!cpu_active(cpu)) return; - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) - return; - /* * Can be set safely without rq->lock held * If a clear happens, it will have evaluated last additions because @@ -12721,13 +12701,6 @@ static void nohz_newidle_balance(struct rq *this_rq) { int this_cpu = this_rq->cpu; - /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping - */ - if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) - return; - /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return; @@ -12864,11 +12837,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) * have been enqueued in the meantime. Since we're not going idle, * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) + if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_queued) pulled_task = -1; out: @@ -12889,9 +12862,9 @@ out: /* * This softirq handler is triggered via SCHED_SOFTIRQ from two places: * - * - directly from the local scheduler_tick() for periodic load balancing + * - directly from the local sched_tick() for periodic load balancing * - * - indirectly from a remote scheduler_tick() for NOHZ idle balancing + * - indirectly from a remote sched_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ static __latent_entropy void sched_balance_softirq(void) @@ -12982,7 +12955,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. 
*/ - if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && + if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } @@ -13126,7 +13099,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->cfs.nr_running == 1) + if (rq->cfs.nr_queued == 1) return; /* @@ -13536,7 +13509,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -13547,16 +13520,8 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; - if (se->on_rq) { - parent_cfs_rq = cfs_rq_of(se); - if (cfs_rq_is_idle(grp_cfs_rq)) - parent_cfs_rq->idle_nr_running++; - else - parent_cfs_rq->idle_nr_running--; - } - - idle_task_delta = grp_cfs_rq->h_nr_running - - grp_cfs_rq->idle_h_nr_running; + idle_task_delta = grp_cfs_rq->h_nr_queued - + grp_cfs_rq->h_nr_idle; if (!cfs_rq_is_idle(grp_cfs_rq)) idle_task_delta *= -1; @@ -13566,7 +13531,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (!se->on_rq) break; - cfs_rq->idle_h_nr_running += idle_task_delta; + cfs_rq->h_nr_idle += idle_task_delta; /* Already accounted at parent level and above. */ if (cfs_rq_is_idle(cfs_rq)) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index a3d331dd2d8f..3c12d9f93331 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -32,6 +32,15 @@ SCHED_FEAT(PREEMPT_SHORT, true) SCHED_FEAT(NEXT_BUDDY, false) /* + * Allow completely ignoring cfs_rq->next; which can be set from various + * places: + * - NEXT_BUDDY (wakeup preemption) + * - yield_to_task() + * - cgroup dequeue / pick + */ +SCHED_FEAT(PICK_BUDDY, true) + +/* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. */ diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 5891e715f00d..81bc8b329ef1 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -9,15 +9,9 @@ */ enum hk_flags { - HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), - HK_FLAG_RCU = BIT(HK_TYPE_RCU), - HK_FLAG_MISC = BIT(HK_TYPE_MISC), - HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), - HK_FLAG_TICK = BIT(HK_TYPE_TICK), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), - HK_FLAG_WQ = BIT(HK_TYPE_WQ), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), - HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), + HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE), }; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden); @@ -97,7 +91,7 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overridden); - if (housekeeping.flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_KERNEL_NOISE) sched_tick_offload_init(); for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) { @@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) unsigned int first_cpu; int err = 0; - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) { if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { pr_warn("Housekeeping: nohz unsupported." 
" Build with CONFIG_NO_HZ_FULL\n"); @@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) housekeeping_setup_type(type, housekeeping_staging); } - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) tick_nohz_full_setup(non_housekeeping_mask); housekeeping.flags |= flags; @@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned long flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | - HK_FLAG_MISC | HK_FLAG_KTHREAD; + flags = HK_FLAG_KERNEL_NOISE; return housekeeping_setup(str, flags); } @@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str) int len; while (isalpha(*str)) { + /* + * isolcpus=nohz is equivalent to nohz_full. + */ if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK; + flags |= HK_FLAG_KERNEL_NOISE; continue; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fee75cc2c47b..7a8534a2deff 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) * * group: [ see update_cfs_group() ] * se_weight() = tg->weight * grq->load_avg / tg->load_avg - * se_runnable() = grq->h_nr_running + * se_runnable() = grq->h_nr_runnable * * runnable_sum = se_runnable() * runnable = grq->runnable_sum * runnable_avg = runnable_sum @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running - cfs_rq->h_nr_delayed, + cfs_rq->h_nr_runnable, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 84dad1511d1e..bb56805e3d47 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st s64 delta; u64 irq; - if (static_branch_likely(&psi_disabled)) + if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; if (!curr->pid) @@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (!irqtime_enabled() && res == PSI_IRQ) + return -EOPNOTSUPP; +#endif + /* Update averages before reporting them */ mutex_lock(&group->avgs_lock); now = sched_clock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c5d67a43fe52..c7cf4cc57cdd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); -extern int dl_bw_check_overflow(int cpu); +extern int dl_bw_deactivate(int cpu); extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following @@ -650,11 +650,10 @@ struct balance_callback { /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned int nr_running; - unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ - unsigned int idle_nr_running; /* SCHED_IDLE */ - unsigned int 
idle_h_nr_running; /* SCHED_IDLE */ - unsigned int h_nr_delayed; + unsigned int nr_queued; + unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */ + unsigned int h_nr_idle; /* SCHED_IDLE */ s64 avg_vruntime; u64 avg_load; @@ -904,11 +903,8 @@ struct dl_rq { static inline void se_update_runnable(struct sched_entity *se) { - if (!entity_is_task(se)) { - struct cfs_rq *cfs_rq = se->my_q; - - se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed; - } + if (!entity_is_task(se)) + se->runnable_weight = se->my_q->h_nr_runnable; } static inline long se_runnable(struct sched_entity *se) @@ -2280,7 +2276,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p) static inline int task_on_rq_queued(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_QUEUED; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED; } static inline int task_on_rq_migrating(struct task_struct *p) @@ -2574,7 +2570,7 @@ static inline bool sched_rt_runnable(struct rq *rq) static inline bool sched_fair_runnable(struct rq *rq) { - return rq->cfs.nr_running > 0; + return rq->cfs.nr_queued > 0; } extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); @@ -3242,6 +3238,12 @@ struct irqtime { }; DECLARE_PER_CPU(struct irqtime, cpu_irqtime); +DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime); + +static inline int irqtime_enabled(void) +{ + return static_branch_likely(&sched_clock_irqtime); +} /* * Returns the irqtime minus the softirq time computed by ksoftirqd. @@ -3262,6 +3264,13 @@ static inline u64 irq_time_read(int cpu) return total; } +#else + +static inline int irqtime_enabled(void) +{ + return 0; +} + #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3509,6 +3518,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */ +extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr); + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index eb0cdcd4d921..4346fd81c31f 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, * Bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 16 +#define SCHEDSTAT_VERSION 17 static int show_schedstat(struct seq_file *seq, void *v) { @@ -138,14 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v) for_each_domain(cpu, sd) { enum cpu_idle_type itype; - seq_printf(seq, "domain%d %*pb", dcount++, + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, cpumask_pr_args(sched_domain_span(sd))); for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", + seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", sd->lb_count[itype], sd->lb_balanced[itype], sd->lb_failed[itype], - sd->lb_imbalance[itype], + sd->lb_imbalance_load[itype], + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], sd->lb_gained[itype], sd->lb_hot_gained[itype], sd->lb_nobusyq[itype], diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8ee0add5a48a..6ade91bce63e 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -138,6 +138,10 @@ 
static inline void psi_enqueue(struct task_struct *p, int flags) if (flags & ENQUEUE_RESTORE) return; + /* psi_sched_switch() will handle the flags */ + if (task_on_cpu(task_rq(p), p)) + return; + if (p->se.sched_delayed) { /* CPU migration of "sleeping" task */ SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index ff0e5ab4e37c..149e2c8036d3 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -300,20 +300,10 @@ static void __setscheduler_params(struct task_struct *p, p->policy = policy; - if (dl_policy(policy)) { + if (dl_policy(policy)) __setparam_dl(p, attr); - } else if (fair_policy(policy)) { - p->static_prio = NICE_TO_PRIO(attr->sched_nice); - if (attr->sched_runtime) { - p->se.custom_slice = 1; - p->se.slice = clamp_t(u64, attr->sched_runtime, - NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ - NSEC_PER_MSEC*100); /* HZ=100 / 10 */ - } else { - p->se.custom_slice = 0; - p->se.slice = sysctl_sched_base_slice; - } - } + else if (fair_policy(policy)) + __setparam_fair(p, attr); /* rt-policy tasks do not have a timerslack */ if (rt_or_dl_task_policy(p)) { @@ -1433,7 +1423,7 @@ int __sched yield_to(struct task_struct *p, bool preempt) struct rq *rq, *p_rq; int yielded = 0; - scoped_guard (irqsave) { + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); again: diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9748a4c8d668..da33ec9e94ab 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1635,9 +1635,7 @@ sd_init(struct sched_domain_topology_level *tl, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -#ifdef CONFIG_SCHED_DEBUG .name = tl->name, -#endif }; sd_span = sched_domain_span(sd); @@ -2338,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { pr_err("BUG: arch topology borken\n"); -#ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -#endif /* Fixup, ensure @sd has at least @child CPUs. */ cpumask_or(sched_domain_span(sd), sched_domain_span(sd), @@ -2721,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], /* * This domain won't be destroyed and as such - * its dl_bw->total_bw needs to be cleared. It - * will be recomputed in function - * update_tasks_root_domain(). + * its dl_bw->total_bw needs to be cleared. + * Tasks contribution will be then recomputed + * in function dl_update_tasks_root_domain(), + * dl_servers contribution in function + * dl_restore_server_root_domain(). */ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; dl_clear_root_domain(rd); diff --git a/kernel/signal.c b/kernel/signal.c index 989b1cc9116a..a2afd54303f0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr) if (!list_empty(&q->list)) { /* - * If task group is exiting with the signal already pending, - * wait for __exit_signal() to do its job. Otherwise if - * ignored, it's not supposed to be queued. Try to survive. + * The signal was ignored and blocked. The timer + * expiry queued it because blocked signals are + * queued independent of the ignored state. + * + * The unblocking set SIGPENDING, but the signal + * was not yet dequeued from the pending list. + * So prepare_signal() sees unblocked and ignored, + * which ends up here. Leave it queued like a + * regular signal. 
+ * + * The same happens when the task group is exiting + * and the signal is already queued. + * prepare_signal() treats SIGNAL_GROUP_EXIT as + * ignored independent of its queued state. This + * gets cleaned up in __exit_signal(). */ - WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT)); goto out; } @@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr) goto out; } - /* This should never happen and leaks a reference count */ - if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list))) - hlist_del_init(&tmr->ignored_list); - if (unlikely(!list_empty(&q->list))) { /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } - posixtimer_sigqueue_getref(q); + /* + * If the signal is on the ignore list, it got blocked after it was + * ignored earlier. But nothing lifted the ignore. Move it back to + * the pending list to be consistent with the regular signal + * handling. This already holds a reference count. + * + * If it's not on the ignore list acquire a reference count. + */ + if (likely(hlist_unhashed(&tmr->ignored_list))) + posixtimer_sigqueue_getref(q); + else + hlist_del_init(&tmr->ignored_list); + posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: diff --git a/kernel/smp.c b/kernel/smp.c index 27dc31a146a3..f104c8e83fc4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -815,7 +815,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, WARN_ON_ONCE(!in_task()); /* Check if we need local execution. */ - if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask)) + if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && + (!cond_func || cond_func(this_cpu, info))) run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one. */ @@ -868,7 +869,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, send_call_function_ipi_mask(cfd->cpumask_ipi); } - if (run_local && (!cond_func || cond_func(this_cpu, info))) { + if (run_local) { unsigned long flags; local_irq_save(flags); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5c9202cb8f59..7ae7a4136855 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1804,15 +1804,6 @@ static struct ctl_table kern_table[] = { }, #endif { - .procname = "pid_max", - .data = &pid_max, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &pid_max_min, - .extra2 = &pid_max_max, - }, - { .procname = "panic_on_oops", .data = &panic_on_oops, .maxlen = sizeof(int), diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 62e73444ffe4..38dae590b29f 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -137,7 +137,8 @@ static int wdtest_func(void *arg) udelay(1); j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), + "Expected at least 1000ns, got %lu.\n", j2 - j1); /* Verify tsc-like stability with various numbers of errors injected. */ max_retries = clocksource_get_max_watchdog_retry(); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 80fe3749d2db..f6d8df94045c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1067,11 +1067,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). 
Must hold the base lock. * - * Returns 1 when the new timer is the leftmost timer in the tree. + * Returns true when the new timer is the leftmost timer in the tree. */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - enum hrtimer_mode mode) +static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); @@ -2202,6 +2201,15 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; + hrtimer_cpu_base_init_expiry_lock(cpu_base); + return 0; +} + +int hrtimers_cpu_starting(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + + /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; @@ -2210,7 +2218,6 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; - hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2286,5 +2293,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 881a9ce96af7..1b675aee99a9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -538,7 +538,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * When the reference count reaches zero, the timer is scheduled * for RCU removal after the grace period. * - * Holding rcu_read_lock() accross the lookup ensures that + * Holding rcu_read_lock() across the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ed58eebb4e8f..0207868c8b4d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1020,6 +1020,8 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device + * @bc: the broadcast device + * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d128825d343..1e67d076f195 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -485,91 +485,30 @@ u64 notrace ktime_get_tai_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); -static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) { + struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; - u64 basem, baser, delta; + u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); - if (mono) - *mono = basem + delta; return baser + delta; } - -/** - * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. 
- * - * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. - */ -u64 ktime_get_real_fast_ns(void) -{ - return __ktime_get_real_fast(&tk_fast_mono, NULL); -} EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** - * ktime_get_fast_timestamps: - NMI safe timestamps - * @snapshot: Pointer to timestamp storage - * - * Stores clock monotonic, boottime and realtime timestamps. - * - * Boot time is a racy access on 32bit systems if the sleep time injection - * happens late during resume and not in timekeeping_resume(). That could - * be avoided by expanding struct tk_read_base with boot offset for 32bit - * and adding more overhead to the update. As this is a hard to observe - * once per resume event which can be filtered with reasonable effort using - * the accurate mono/real timestamps, it's probably not worth the trouble. - * - * Aside of that it might be possible on 32 and 64 bit to observe the - * following when the sleep time injection happens late: - * - * CPU 0 CPU 1 - * timekeeping_resume() - * ktime_get_fast_timestamps() - * mono, real = __ktime_get_real_fast() - * inject_sleep_time() - * update boot offset - * boot = mono + bootoffset; - * - * That means that boot time already has the sleep time adjustment, but - * real time does not. On the next readout both are in sync again. - * - * Preventing this for 64bit is not really feasible without destroying the - * careful cache layout of the timekeeper because the sequence count and - * struct tk_read_base would then need two cache lines instead of one. - * - * Access to the time keeper clock source is disabled across the innermost - * steps of suspend/resume. The accessors still work, but the timestamps - * are frozen until time keeping is resumed which happens very early. - * - * For regular suspend/resume there is no observable difference vs. sched - * clock, but it might affect some of the nasty low level debug printks. - * - * OTOH, access to sched clock is not guaranteed across suspend/resume on - * all systems either so it depends on the hardware in use. - * - * If that turns out to be a real problem then this could be mitigated by - * using sched clock in a similar way as during early boot. But it's not as - * trivial as on early boot because it needs some careful protection - * against the clock monotonic timestamp jumping backwards on resume. - */ -void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); - snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); -} - -/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5860bf6d16f..40706cb36920 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -956,33 +956,29 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = per_cpu_ptr(&timer_bases[index], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. 
*/ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); - return base; + index = BASE_DEF; + + return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = this_cpu_ptr(&timer_bases[index]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = this_cpu_ptr(&timer_bases[BASE_DEF]); - return base; + index = BASE_DEF; + + return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 8d57f7686bb0..9cb9b6584ea1 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, break; child = group; - group = group->parent; + /* + * Pairs with the store release on group connection + * to make sure group initialization is visible. + */ + group = READ_ONCE(group->parent); data->childmask = child->groupmask; + WARN_ON_ONCE(!data->childmask); } while (group); } @@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) while ((node = timerqueue_getnext(&group->events))) { evt = container_of(node, struct tmigr_event, nextevt); - if (!evt->ignore) { + if (!READ_ONCE(evt->ignore)) { WRITE_ONCE(group->next_expiry, evt->nextevt.expires); return evt; } @@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group, * lock is held while updating the ignore flag in idle path. So this * state change will not be lost. */ - group->groupevt.ignore = true; + WRITE_ONCE(group->groupevt.ignore, true); return walk_done; } @@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, union tmigr_state childstate, groupstate; bool remote = data->remote; bool walk_done = false; + bool ignore; u64 nextexp; if (child) { @@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, nextexp = child->next_expiry; evt = &child->groupevt; - evt->ignore = (nextexp == KTIME_MAX) ? true : false; + /* + * This can race with concurrent idle exit (activate). + * If the current writer wins, a useless remote expiration may + * be scheduled. If the activate wins, the event is properly + * ignored. + */ + ignore = (nextexp == KTIME_MAX) ? true : false; + WRITE_ONCE(evt->ignore, ignore); } else { nextexp = data->nextexp; first_childevt = evt = data->evt; + ignore = evt->ignore; /* * Walking the hierarchy is required in any case when a @@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * first event information of the group is updated properly and * also handled properly, so skip this fast return path. */ - if (evt->ignore && !remote && group->parent) + if (ignore && !remote && group->parent) return true; raw_spin_lock(&group->lock); @@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. 
*/ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + if ((evt->nextevt.expires == nextexp) && !ignore) { /* Make sure not to miss a new CPU event with the same expiry */ evt->cpu = first_childevt->cpu; goto check_toplvl; @@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, WRITE_ONCE(group->next_expiry, KTIME_MAX); } - if (evt->ignore) { + if (ignore) { /* * When the next child event could be ignored (nextexp is * KTIME_MAX) and there was no remote timer handling before or @@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); + /* + * If this is a new top-level, prepare its groupmask in advance. + * This avoids accidents where yet another new top-level is + * created in the future and made visible before the current groupmask. + */ + if (list_empty(&tmigr_level_list[lvl])) { + group->groupmask = BIT(0); + /* + * The previous top level has prepared its groupmask already, + * simply account it as the first child. + */ + if (lvl > 0) + group->num_children = 1; + } + timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); - child->parent = parent; - child->groupmask = BIT(parent->num_children++); + if (activate) { + /* + * @child is the old top and @parent the new one. In this + * case groupmask is pre-initialized and @child already + * accounted, along with its new sibling corresponding to the + * CPU going up. + */ + WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + } else { + /* Adding @child for the CPU going up to @parent. */ + child->groupmask = BIT(parent->num_children++); + } + + /* + * Make sure parent initialization is visible before publishing it to a + * racing CPU entering/exiting idle. This RELEASE barrier enforces an + * address dependency that pairs with the READ_ONCE() in __walk_groups(). + */ + smp_store_release(&child->parent, parent); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); @@ -1624,9 +1670,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * be different from tmigr_hierarchy_levels, contains only a * single group. */ - if (group->parent || i == tmigr_hierarchy_levels || - (list_empty(&tmigr_level_list[i]) && - list_is_singular(&tmigr_level_list[i - 1]))) + if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) break; } while (i < tmigr_hierarchy_levels); diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 154accc7a543..ae19f70f8170 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -110,22 +110,19 @@ struct tmigr_cpu { * union tmigr_state - state of tmigr_group * @state: Combined version of the state - only used for atomic * read/cmpxchg function - * @struct: Split version of the state - only use the struct members to + * &anon struct: Split version of the state - only use the struct members to * update information to stay independent of endianness + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator + * @seq: Sequence counter needs to be increased when an update + * to the tmigr_state is done. 
It prevents a race when + * updates in the child groups are propagated in changed + * order. Detailed information about the scenario is + * given in the documentation at the begin of + * timer_migration.c. */ union tmigr_state { u32 state; - /** - * struct - split state of tmigr_group - * @active: Contains each mask bit of the active children - * @migrator: Contains mask of the child which is migrator - * @seq: Sequence counter needs to be increased when an update - * to the tmigr_state is done. It prevents a race when - * updates in the child groups are propagated in changed - * order. Detailed information about the scenario is - * given in the documentation at the begin of - * timer_migration.c. - */ struct { u8 active; u8 migrator; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 74c2b1d43bb9..d570b8b9c0a9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.rst -config HAVE_FUNCTION_GRAPH_RETVAL +config HAVE_FUNCTION_GRAPH_FREGS bool +config HAVE_FTRACE_GRAPH_FUNC + bool + help + True if ftrace_graph_func() is defined. + config HAVE_DYNAMIC_FTRACE bool help @@ -57,6 +62,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS This allows for use of ftrace_regs_get_argument() and ftrace_regs_get_stack_pointer(). +config HAVE_FTRACE_REGS_HAVING_PT_REGS + bool + help + If this is set, ftrace_regs has pt_regs, thus it can convert to + pt_regs without allocating memory. + config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE bool help @@ -232,7 +243,7 @@ config FUNCTION_GRAPH_TRACER config FUNCTION_GRAPH_RETVAL bool "Kernel Function Graph Return Value" - depends on HAVE_FUNCTION_GRAPH_RETVAL + depends on HAVE_FUNCTION_GRAPH_FREGS depends on FUNCTION_GRAPH_TRACER default n help @@ -296,10 +307,9 @@ config DYNAMIC_FTRACE_WITH_ARGS config FPROBE bool "Kernel Function Probe (fprobe)" - depends on FUNCTION_TRACER - depends on DYNAMIC_FTRACE_WITH_REGS - depends on HAVE_RETHOOK - select RETHOOK + depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC + depends on DYNAMIC_FTRACE_WITH_ARGS + select FUNCTION_GRAPH_TRACER default n help This option enables kernel function probe (fprobe) based on ftrace. 
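With FPROBE now built on top of the function-graph tracer (HAVE_FUNCTION_GRAPH_FREGS / HAVE_FTRACE_GRAPH_FUNC) instead of rethook, fprobe callbacks receive a struct ftrace_regs * rather than a struct pt_regs *, as the fprobe.c and bpf_trace.c hunks below show. The following is only an illustrative sketch of a probe user under the new handler signatures; the names sample_entry, sample_exit, sample_fprobe and the "vfs_read" target are made up for the example and are not part of this patch set.

#include <linux/fprobe.h>
#include <linux/ftrace.h>

/*
 * Entry handler: per the comment in fprobe_entry() below, returning
 * non-zero is not counted in nmissed but skips the exit handler.
 */
static int sample_entry(struct fprobe *fp, unsigned long entry_ip,
			unsigned long ret_ip, struct ftrace_regs *fregs,
			void *data)
{
	return 0;
}

/* Exit handler: the return value is read through the ftrace_regs helpers. */
static void sample_exit(struct fprobe *fp, unsigned long entry_ip,
			unsigned long ret_ip, struct ftrace_regs *fregs,
			void *data)
{
	unsigned long retval = ftrace_regs_get_return_value(fregs);

	(void)retval;	/* trace or account the value here */
}

static struct fprobe sample_fprobe = {
	.entry_handler	= sample_entry,
	.exit_handler	= sample_exit,
};

/*
 * Attach by glob pattern (the notfilter argument may be NULL) and detach
 * symmetrically:
 *
 *	register_fprobe(&sample_fprobe, "vfs_read", NULL);
 *	...
 *	unregister_fprobe(&sample_fprobe);
 */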
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8fd292d34d89..3679a6d18934 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -617,8 +617,9 @@ err: return ret; } -static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, char __user *arg) +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -627,29 +628,18 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; + mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { - __blk_trace_remove(q); + blk_trace_remove(q); return -EFAULT; } return 0; } - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) -{ - int ret; - - mutex_lock(&q->debugfs_mutex); - ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->debugfs_mutex); - - return ret; -} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -673,12 +663,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .pid = cbuts.pid, }; + mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { - __blk_trace_remove(q); + blk_trace_remove(q); return -EFAULT; } @@ -732,12 +724,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) int ret, start = 0; char b[BDEVNAME_SIZE]; - mutex_lock(&q->debugfs_mutex); - switch (cmd) { case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); - ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -749,17 +739,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) start = 1; fallthrough; case BLKTRACESTOP: - ret = __blk_trace_startstop(q, start); + ret = blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = __blk_trace_remove(q); + ret = blk_trace_remove(q); break; default: ret = -ENOTTY; break; } - - mutex_unlock(&q->debugfs_mutex); return ret; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1b8db5aee9d3..c462aca8b7e6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -619,7 +619,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_sample_data *sd) + u64 flags, struct perf_raw_record *raw, + struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); @@ -644,6 +645,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; + perf_sample_save_raw_data(sd, event, raw); + return perf_event_output(event, sd, regs); } @@ -687,9 +690,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, } perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - err = __bpf_perf_event_output(regs, map, flags, sd); + err = 
__bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_trace_nest_level); preempt_enable(); @@ -748,9 +750,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, perf_fetch_caller_regs(regs); perf_sample_data_init(sd, 0, 0); - perf_sample_save_raw_data(sd, &raw); - ret = __bpf_perf_event_output(regs, map, flags, sd); + ret = __bpf_perf_event_output(regs, map, flags, &raw, sd); out: this_cpu_dec(bpf_event_output_nest_level); preempt_enable(); @@ -2584,6 +2585,20 @@ struct user_syms { char *buf; }; +#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS +static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs); +#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs) +#else +#define bpf_kprobe_multi_pt_regs_ptr() (NULL) +#endif + +static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip) +{ + unsigned long ip = ftrace_get_symaddr(fentry_ip); + + return ip ? : fentry_ip; +} + static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt) { unsigned long __user usymbol; @@ -2778,7 +2793,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) static int kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, - unsigned long entry_ip, struct pt_regs *regs, + unsigned long entry_ip, struct ftrace_regs *fregs, bool is_return, void *data) { struct bpf_kprobe_multi_run_ctx run_ctx = { @@ -2790,6 +2805,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, .entry_ip = entry_ip, }; struct bpf_run_ctx *old_run_ctx; + struct pt_regs *regs; int err; if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { @@ -2800,6 +2816,7 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, migrate_disable(); rcu_read_lock(); + regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr()); old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx); err = bpf_prog_run(link->link.prog, regs); bpf_reset_run_ctx(old_run_ctx); @@ -2813,26 +2830,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, static int kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; int err; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data); + err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, false, data); return is_kprobe_session(link->link.prog) ? 
err : 0; } static void kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *data) { struct bpf_kprobe_multi_link *link; link = container_of(fp, struct bpf_kprobe_multi_link, fp); - kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data); + kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip), + fregs, true, data); } static int symbols_cmp_r(const void *a, const void *b, const void *priv) diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 30e3ddc8a8a8..9e6b5a71555b 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -292,13 +292,15 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset) } /* ftrace_graph_entry set to this to tell some archs to run function graph */ -static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops) +static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) { return 0; } /* ftrace_graph_return set to this to tell some archs to run function graph */ -static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops) +static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops, + struct ftrace_regs *fregs) { } @@ -520,13 +522,15 @@ int __weak ftrace_disable_ftrace_graph_caller(void) #endif int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { return 0; } static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { } @@ -644,14 +648,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, #endif /* If the caller does not use ftrace, call this function. */ -int function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp) +int function_graph_enter_regs(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp, + struct ftrace_regs *fregs) { struct ftrace_graph_ent trace; unsigned long bitmap = 0; int offset; + int bit; int i; + bit = ftrace_test_recursion_trylock(func, ret); + if (bit < 0) + return -EBUSY; + trace.func = func; trace.depth = ++current->curr_ret_depth; @@ -663,7 +673,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (static_branch_likely(&fgraph_do_direct)) { int save_curr_ret_stack = current->curr_ret_stack; - if (static_call(fgraph_func)(&trace, fgraph_direct_gops)) + if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs)) bitmap |= BIT(fgraph_direct_gops->idx); else /* Clear out any saved storage */ @@ -681,7 +691,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, save_curr_ret_stack = current->curr_ret_stack; if (ftrace_ops_test(&gops->ops, func, NULL) && - gops->entryfunc(&trace, gops)) + gops->entryfunc(&trace, gops, fregs)) bitmap |= BIT(i); else /* Clear out any saved storage */ @@ -697,12 +707,13 @@ int function_graph_enter(unsigned long ret, unsigned long func, * flag, set that bit always. 
*/ set_bitmap(current, offset, bitmap | BIT(0)); - + ftrace_test_recursion_unlock(bit); return 0; out_ret: current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1; out: current->curr_ret_depth--; + ftrace_test_recursion_unlock(bit); return -EBUSY; } @@ -792,15 +803,12 @@ static struct notifier_block ftrace_suspend_notifier = { .notifier_call = ftrace_suspend_notifier_call, }; -/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */ -struct fgraph_ret_regs; - /* * Send the trace to the ring-buffer. * @return the original return address. */ -static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs, - unsigned long frame_pointer) +static inline unsigned long +__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer) { struct ftrace_ret_stack *ret_stack; struct ftrace_graph_ret trace; @@ -819,8 +827,11 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs } trace.rettime = trace_clock_local(); + if (fregs) + ftrace_regs_set_instruction_pointer(fregs, ret); + #ifdef CONFIG_FUNCTION_GRAPH_RETVAL - trace.retval = fgraph_ret_regs_return_value(ret_regs); + trace.retval = ftrace_regs_get_return_value(fregs); #endif bitmap = get_bitmap_bits(current, offset); @@ -828,7 +839,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs #ifdef CONFIG_HAVE_STATIC_CALL if (static_branch_likely(&fgraph_do_direct)) { if (test_bit(fgraph_direct_gops->idx, &bitmap)) - static_call(fgraph_retfunc)(&trace, fgraph_direct_gops); + static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs); } else #endif { @@ -838,7 +849,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs if (gops == &fgraph_stub) continue; - gops->retfunc(&trace, gops); + gops->retfunc(&trace, gops, fregs); } } @@ -855,14 +866,14 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs } /* - * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can - * leave only ftrace_return_to_handler(ret_regs). + * After all architecures have selected HAVE_FUNCTION_GRAPH_FREGS, we can + * leave only ftrace_return_to_handler(fregs). */ -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL -unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs) +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS +unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs) { - return __ftrace_return_to_handler(ret_regs, - fgraph_ret_regs_frame_pointer(ret_regs)); + return __ftrace_return_to_handler(fregs, + ftrace_regs_get_frame_pointer(fregs)); } #else unsigned long ftrace_return_to_handler(unsigned long frame_pointer) @@ -1010,7 +1021,8 @@ void ftrace_graph_sleep_time_control(bool enable) * Simply points to ftrace_stub, but with the proper protocol. 
* Defined by the linker script in linux/vmlinux.lds.h */ -void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); +void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); /* The callbacks that hook a function */ trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; @@ -1174,7 +1186,8 @@ void ftrace_graph_exit_task(struct task_struct *t) #ifdef CONFIG_DYNAMIC_FTRACE static int fgraph_pid_func(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = gops->ops.private; int pid; @@ -1188,7 +1201,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace, return 0; } - return gops->saved_func(trace, gops); + return gops->saved_func(trace, gops, fregs); } void fgraph_update_pid_func(void) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 9ff018245840..2560b312ad57 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -8,98 +8,224 @@ #include <linux/fprobe.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> -#include <linux/rethook.h> +#include <linux/list.h> +#include <linux/mutex.h> #include <linux/slab.h> #include <linux/sort.h> +#include <asm/fprobe.h> + #include "trace.h" -struct fprobe_rethook_node { - struct rethook_node node; - unsigned long entry_ip; - unsigned long entry_parent_ip; - char data[]; -}; +#define FPROBE_IP_HASH_BITS 8 +#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS) -static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) -{ - struct fprobe_rethook_node *fpr; - struct rethook_node *rh = NULL; - struct fprobe *fp; - void *entry_data = NULL; - int ret = 0; +#define FPROBE_HASH_BITS 6 +#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS) - fp = container_of(ops, struct fprobe, ops); +#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2)) - if (fp->exit_handler) { - rh = rethook_try_get(fp->rethook); - if (!rh) { - fp->nmissed++; - return; - } - fpr = container_of(rh, struct fprobe_rethook_node, node); - fpr->entry_ip = ip; - fpr->entry_parent_ip = parent_ip; - if (fp->entry_data_size) - entry_data = fpr->data; +/* + * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still + * exists. The key is the address of fprobe instance. + * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe + * instance related to the funciton address. The key is the ftrace IP + * address. + * + * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp + * are set NULL and delete those from both hash tables (by hlist_del_rcu). + * After an RCU grace period, the fprobe_hlist itself will be released. + * + * fprobe_table and fprobe_ip_table can be accessed from either + * - Normal hlist traversal and RCU add/del under 'fprobe_mutex' is held. + * - RCU hlist traversal under disabling preempt + */ +static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE]; +static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE]; +static DEFINE_MUTEX(fprobe_mutex); + +/* + * Find first fprobe in the hlist. It will be iterated twice in the entry + * probe, once for correcting the total required size, the second time is + * calling back the user handlers. + * Thus the hlist in the fprobe_table must be sorted and new probe needs to + * be added *before* the first fprobe. 
+ */ +static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip) +{ + struct fprobe_hlist_node *node; + struct hlist_head *head; + + head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; + hlist_for_each_entry_rcu(node, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (node->addr == ip) + return node; } + return NULL; +} +NOKPROBE_SYMBOL(find_first_fprobe_node); - if (fp->entry_handler) - ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data); +/* Node insertion and deletion requires the fprobe_mutex */ +static void insert_fprobe_node(struct fprobe_hlist_node *node) +{ + unsigned long ip = node->addr; + struct fprobe_hlist_node *next; + struct hlist_head *head; - /* If entry_handler returns !0, nmissed is not counted. */ - if (rh) { - if (ret) - rethook_recycle(rh); - else - rethook_hook(rh, ftrace_get_regs(fregs), true); + lockdep_assert_held(&fprobe_mutex); + + next = find_first_fprobe_node(ip); + if (next) { + hlist_add_before_rcu(&node->hlist, &next->hlist); + return; } + head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)]; + hlist_add_head_rcu(&node->hlist, head); } -static void fprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +/* Return true if there are synonims */ +static bool delete_fprobe_node(struct fprobe_hlist_node *node) { - struct fprobe *fp; - int bit; + lockdep_assert_held(&fprobe_mutex); - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; + WRITE_ONCE(node->fp, NULL); + hlist_del_rcu(&node->hlist); + return !!find_first_fprobe_node(node->addr); +} - /* recursion detection has to go before any traceable function and - * all functions before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; +/* Check existence of the fprobe */ +static bool is_fprobe_still_exist(struct fprobe *fp) +{ + struct hlist_head *head; + struct fprobe_hlist *fph; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_for_each_entry_rcu(fph, head, hlist, + lockdep_is_held(&fprobe_mutex)) { + if (fph->fp == fp) + return true; } - __fprobe_handler(ip, parent_ip, ops, fregs); - ftrace_test_recursion_unlock(bit); + return false; +} +NOKPROBE_SYMBOL(is_fprobe_still_exist); + +static int add_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + struct hlist_head *head; + + lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + if (is_fprobe_still_exist(fp)) + return -EEXIST; + + head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)]; + hlist_add_head_rcu(&fp->hlist_array->hlist, head); + return 0; +} + +static int del_fprobe_hash(struct fprobe *fp) +{ + struct fprobe_hlist *fph = fp->hlist_array; + lockdep_assert_held(&fprobe_mutex); + + if (WARN_ON_ONCE(!fph)) + return -EINVAL; + + if (!is_fprobe_still_exist(fp)) + return -ENOENT; + + fph->fp = NULL; + hlist_del_rcu(&fph->hlist); + return 0; } -NOKPROBE_SYMBOL(fprobe_handler); -static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct ftrace_regs *fregs) +#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER + +/* The arch should encode fprobe_header info into one unsigned long */ +#define FPROBE_HEADER_SIZE_IN_LONG 1 + +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) { + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD || + 
!arch_fprobe_header_encodable(fp))) + return false; + + *stack = arch_encode_fprobe_header(fp, size_words); + return true; +} + +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) +{ + *fp = arch_decode_fprobe_header_fp(*stack); + *size_words = arch_decode_fprobe_header_size(*stack); +} + +#else + +/* Generic fprobe_header */ +struct __fprobe_header { struct fprobe *fp; - int bit; + unsigned long size_words; +} __packed; - fp = container_of(ops, struct fprobe, ops); - if (fprobe_disabled(fp)) - return; +#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header)) - /* recursion detection has to go before any traceable function and - * all functions called before this point should be marked as notrace - */ - bit = ftrace_test_recursion_trylock(ip, parent_ip); - if (bit < 0) { - fp->nmissed++; - return; - } +static inline bool write_fprobe_header(unsigned long *stack, + struct fprobe *fp, unsigned int size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD)) + return false; + + fph->fp = fp; + fph->size_words = size_words; + return true; +} + +static inline void read_fprobe_header(unsigned long *stack, + struct fprobe **fp, unsigned int *size_words) +{ + struct __fprobe_header *fph = (struct __fprobe_header *)stack; + + *fp = fph->fp; + *size_words = fph->size_words; +} + +#endif + +/* + * fprobe shadow stack management: + * Since fprobe shares a single fgraph_ops, it needs to share the stack entry + * among the probes on the same function exit. Note that a new probe can be + * registered before a target function is returning, we can not use the hash + * table to find the corresponding probes. Thus the probe address is stored on + * the shadow stack with its entry data size. + * + */ +static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + if (!fp->entry_handler) + return 0; + + return fp->entry_handler(fp, ip, parent_ip, fregs, data); +} +static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct fprobe *fp, struct ftrace_regs *fregs, + void *data) +{ + int ret; /* * This user handler is shared with other kprobes and is not expected to be * called recursively. 
So if any other kprobe handler is running, this will @@ -108,44 +234,183 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, */ if (unlikely(kprobe_running())) { fp->nmissed++; - goto recursion_unlock; + return 0; } kprobe_busy_begin(); - __fprobe_handler(ip, parent_ip, ops, fregs); + ret = __fprobe_handler(ip, parent_ip, fp, fregs, data); kprobe_busy_end(); - -recursion_unlock: - ftrace_test_recursion_unlock(bit); + return ret; } -static void fprobe_exit_handler(struct rethook_node *rh, void *data, - unsigned long ret_ip, struct pt_regs *regs) +static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs) { - struct fprobe *fp = (struct fprobe *)data; - struct fprobe_rethook_node *fpr; - int bit; + struct fprobe_hlist_node *node, *first; + unsigned long *fgraph_data = NULL; + unsigned long func = trace->func; + unsigned long ret_ip; + int reserved_words; + struct fprobe *fp; + int used, ret; - if (!fp || fprobe_disabled(fp)) - return; + if (WARN_ON_ONCE(!fregs)) + return 0; - fpr = container_of(rh, struct fprobe_rethook_node, node); + first = node = find_first_fprobe_node(func); + if (unlikely(!first)) + return 0; + + reserved_words = 0; + hlist_for_each_entry_from_rcu(node, hlist) { + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (!fp || !fp->exit_handler) + continue; + /* + * Since fprobe can be enabled until the next loop, we ignore the + * fprobe's disabled flag in this loop. + */ + reserved_words += + FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size); + } + node = first; + if (reserved_words) { + fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long)); + if (unlikely(!fgraph_data)) { + hlist_for_each_entry_from_rcu(node, hlist) { + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (fp && !fprobe_disabled(fp)) + fp->nmissed++; + } + return 0; + } + } /* - * we need to assure no calls to traceable functions in-between the - * end of fprobe_handler and the beginning of fprobe_exit_handler. + * TODO: recursion detection has been done in the fgraph. Thus we need + * to add a callback to increment missed counter. */ - bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip); - if (bit < 0) { - fp->nmissed++; + ret_ip = ftrace_regs_get_return_address(fregs); + used = 0; + hlist_for_each_entry_from_rcu(node, hlist) { + int data_size; + void *data; + + if (node->addr != func) + break; + fp = READ_ONCE(node->fp); + if (!fp || fprobe_disabled(fp)) + continue; + + data_size = fp->entry_data_size; + if (data_size && fp->exit_handler) + data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG; + else + data = NULL; + + if (fprobe_shared_with_kprobes(fp)) + ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data); + else + ret = __fprobe_handler(func, ret_ip, fp, fregs, data); + + /* If entry_handler returns !0, nmissed is not counted but skips exit_handler. */ + if (!ret && fp->exit_handler) { + int size_words = SIZE_IN_LONG(data_size); + + if (write_fprobe_header(&fgraph_data[used], fp, size_words)) + used += FPROBE_HEADER_SIZE_IN_LONG + size_words; + } + } + if (used < reserved_words) + memset(fgraph_data + used, 0, reserved_words - used); + + /* If any exit_handler is set, data must be used. 
*/ + return used != 0; +} +NOKPROBE_SYMBOL(fprobe_entry); + +static void fprobe_return(struct ftrace_graph_ret *trace, + struct fgraph_ops *gops, + struct ftrace_regs *fregs) +{ + unsigned long *fgraph_data = NULL; + unsigned long ret_ip; + struct fprobe *fp; + int size, curr; + int size_words; + + fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size); + if (WARN_ON_ONCE(!fgraph_data)) return; + size_words = SIZE_IN_LONG(size); + ret_ip = ftrace_regs_get_instruction_pointer(fregs); + + preempt_disable(); + + curr = 0; + while (size_words > curr) { + read_fprobe_header(&fgraph_data[curr], &fp, &size); + if (!fp) + break; + curr += FPROBE_HEADER_SIZE_IN_LONG; + if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) { + if (WARN_ON_ONCE(curr + size > size_words)) + break; + fp->exit_handler(fp, trace->func, ret_ip, fregs, + size ? fgraph_data + curr : NULL); + } + curr += size; } + preempt_enable(); +} +NOKPROBE_SYMBOL(fprobe_return); + +static struct fgraph_ops fprobe_graph_ops = { + .entryfunc = fprobe_entry, + .retfunc = fprobe_return, +}; +static int fprobe_graph_active; - fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs, - fp->entry_data_size ? (void *)fpr->data : NULL); - ftrace_test_recursion_unlock(bit); +/* Add @addrs to the ftrace filter and register fgraph if needed. */ +static int fprobe_graph_add_ips(unsigned long *addrs, int num) +{ + int ret; + + lockdep_assert_held(&fprobe_mutex); + + ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0); + if (ret) + return ret; + + if (!fprobe_graph_active) { + ret = register_ftrace_graph(&fprobe_graph_ops); + if (WARN_ON_ONCE(ret)) { + ftrace_free_filter(&fprobe_graph_ops.ops); + return ret; + } + } + fprobe_graph_active++; + return 0; +} + +/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */ +static void fprobe_graph_remove_ips(unsigned long *addrs, int num) +{ + lockdep_assert_held(&fprobe_mutex); + + fprobe_graph_active--; + if (!fprobe_graph_active) { + /* Q: should we unregister it ? 
*/ + unregister_ftrace_graph(&fprobe_graph_ops); + return; + } + + ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0); } -NOKPROBE_SYMBOL(fprobe_exit_handler); static int symbols_cmp(const void *a, const void *b) { @@ -175,53 +440,97 @@ static unsigned long *get_ftrace_locations(const char **syms, int num) return ERR_PTR(-ENOENT); } -static void fprobe_init(struct fprobe *fp) -{ - fp->nmissed = 0; - if (fprobe_shared_with_kprobes(fp)) - fp->ops.func = fprobe_kprobe_handler; - else - fp->ops.func = fprobe_handler; - fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; -} +struct filter_match_data { + const char *filter; + const char *notfilter; + size_t index; + size_t size; + unsigned long *addrs; +}; -static int fprobe_init_rethook(struct fprobe *fp, int num) +static int filter_match_callback(void *data, const char *name, unsigned long addr) { - int size; + struct filter_match_data *match = data; - if (!fp->exit_handler) { - fp->rethook = NULL; + if (!glob_match(match->filter, name) || + (match->notfilter && glob_match(match->notfilter, name))) return 0; - } - /* Initialize rethook if needed */ - if (fp->nr_maxactive) - num = fp->nr_maxactive; - else - num *= num_possible_cpus() * 2; - if (num <= 0) - return -EINVAL; + if (!ftrace_location(addr)) + return 0; + + if (match->addrs) + match->addrs[match->index] = addr; - size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size; + match->index++; + return match->index == match->size; +} - /* Initialize rethook */ - fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num); - if (IS_ERR(fp->rethook)) - return PTR_ERR(fp->rethook); +/* + * Make IP list from the filter/no-filter glob patterns. + * Return the number of matched symbols, or -ENOENT. + */ +static int ip_list_from_filter(const char *filter, const char *notfilter, + unsigned long *addrs, size_t size) +{ + struct filter_match_data match = { .filter = filter, .notfilter = notfilter, + .index = 0, .size = size, .addrs = addrs}; + int ret; - return 0; + ret = kallsyms_on_each_symbol(filter_match_callback, &match); + if (ret < 0) + return ret; + ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match); + if (ret < 0) + return ret; + + return match.index ?: -ENOENT; } static void fprobe_fail_cleanup(struct fprobe *fp) { - if (!IS_ERR_OR_NULL(fp->rethook)) { - /* Don't need to cleanup rethook->handler because this is not used. */ - rethook_free(fp->rethook); - fp->rethook = NULL; + kfree(fp->hlist_array); + fp->hlist_array = NULL; +} + +/* Initialize the fprobe data structure. */ +static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) +{ + struct fprobe_hlist *hlist_array; + unsigned long addr; + int size, i; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + size = ALIGN(fp->entry_data_size, sizeof(long)); + if (size > MAX_FPROBE_DATA_SIZE) + return -E2BIG; + fp->entry_data_size = size; + + hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL); + if (!hlist_array) + return -ENOMEM; + + fp->nmissed = 0; + + hlist_array->size = num; + fp->hlist_array = hlist_array; + hlist_array->fp = fp; + for (i = 0; i < num; i++) { + hlist_array->array[i].fp = fp; + addr = ftrace_location(addrs[i]); + if (!addr) { + fprobe_fail_cleanup(fp); + return -ENOENT; + } + hlist_array->array[i].addr = addr; } - ftrace_free_filter(&fp->ops); + return 0; } +#define FPROBE_IPS_MAX INT_MAX + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. 
@@ -235,46 +544,24 @@ static void fprobe_fail_cleanup(struct fprobe *fp) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { - struct ftrace_hash *hash; - unsigned char *str; - int ret, len; + unsigned long *addrs; + int ret; if (!fp || !filter) return -EINVAL; - fprobe_init(fp); - - len = strlen(filter); - str = kstrdup(filter, GFP_KERNEL); - ret = ftrace_set_filter(&fp->ops, str, len, 0); - kfree(str); - if (ret) + ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX); + if (ret < 0) return ret; - if (notfilter) { - len = strlen(notfilter); - str = kstrdup(notfilter, GFP_KERNEL); - ret = ftrace_set_notrace(&fp->ops, str, len, 0); - kfree(str); - if (ret) - goto out; - } - - /* TODO: - * correctly calculate the total number of filtered symbols - * from both filter and notfilter. - */ - hash = rcu_access_pointer(fp->ops.local_hash.filter_hash); - if (WARN_ON_ONCE(!hash)) - goto out; - - ret = fprobe_init_rethook(fp, (int)hash->count); - if (!ret) - ret = register_ftrace_function(&fp->ops); + addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + ret = ip_list_from_filter(filter, notfilter, addrs, ret); + if (ret > 0) + ret = register_fprobe_ips(fp, addrs, ret); -out: - if (ret) - fprobe_fail_cleanup(fp); + kfree(addrs); return ret; } EXPORT_SYMBOL_GPL(register_fprobe); @@ -282,7 +569,7 @@ EXPORT_SYMBOL_GPL(register_fprobe); /** * register_fprobe_ips() - Register fprobe to ftrace by address. * @fp: A fprobe data structure to be registered. - * @addrs: An array of target ftrace location addresses. + * @addrs: An array of target function address. * @num: The number of entries of @addrs. * * Register @fp to ftrace for enabling the probe on the address given by @addrs. @@ -294,23 +581,27 @@ EXPORT_SYMBOL_GPL(register_fprobe); */ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) { - int ret; - - if (!fp || !addrs || num <= 0) - return -EINVAL; - - fprobe_init(fp); + struct fprobe_hlist *hlist_array; + int ret, i; - ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + ret = fprobe_init(fp, addrs, num); if (ret) return ret; - ret = fprobe_init_rethook(fp, num); - if (!ret) - ret = register_ftrace_function(&fp->ops); + mutex_lock(&fprobe_mutex); + + hlist_array = fp->hlist_array; + ret = fprobe_graph_add_ips(addrs, num); + if (!ret) { + add_fprobe_hash(fp); + for (i = 0; i < hlist_array->size; i++) + insert_fprobe_node(&hlist_array->array[i]); + } + mutex_unlock(&fprobe_mutex); if (ret) fprobe_fail_cleanup(fp); + return ret; } EXPORT_SYMBOL_GPL(register_fprobe_ips); @@ -348,14 +639,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms); bool fprobe_is_registered(struct fprobe *fp) { - if (!fp || (fp->ops.saved_func != fprobe_handler && - fp->ops.saved_func != fprobe_kprobe_handler)) + if (!fp || !fp->hlist_array) return false; return true; } /** - * unregister_fprobe() - Unregister fprobe from ftrace + * unregister_fprobe() - Unregister fprobe. * @fp: A fprobe data structure to be unregistered. * * Unregister fprobe (and remove ftrace hooks from the function entries). 
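Editor's note: as a reading aid for the reworked registration and unregistration paths, here is a minimal sketch of a module built against the fprobe API with the ftrace_regs-based handler signatures used throughout this series. The probed symbol, the naming, and the log messages are illustrative only; symbols that do not resolve to ftrace locations cannot be attached and register_fprobe() fails with -ENOENT in that case.

#include <linux/module.h>
#include <linux/fprobe.h>
#include <linux/ftrace.h>

static int sample_entry(struct fprobe *fp, unsigned long entry_ip,
			unsigned long ret_ip, struct ftrace_regs *fregs,
			void *entry_data)
{
	pr_info("entered %pS\n", (void *)entry_ip);
	return 0;	/* returning !0 would cancel the exit handler for this call */
}

static void sample_exit(struct fprobe *fp, unsigned long entry_ip,
			unsigned long ret_ip, struct ftrace_regs *fregs,
			void *entry_data)
{
	pr_info("returning to %pS, retval=%lx\n", (void *)ret_ip,
		ftrace_regs_get_return_value(fregs));
}

static struct fprobe sample_probe = {
	.entry_handler	= sample_entry,
	.exit_handler	= sample_exit,
};

static int __init sample_probe_init(void)
{
	/* Glob filter plus an optional notfilter, as register_fprobe() expects. */
	return register_fprobe(&sample_probe, "kernel_clone", NULL);
}

static void __exit sample_probe_cleanup(void)
{
	unregister_fprobe(&sample_probe);
}

module_init(sample_probe_init);
module_exit(sample_probe_cleanup);
MODULE_LICENSE("GPL");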
@@ -364,23 +654,41 @@ bool fprobe_is_registered(struct fprobe *fp) */ int unregister_fprobe(struct fprobe *fp) { - int ret; + struct fprobe_hlist *hlist_array; + unsigned long *addrs = NULL; + int ret = 0, i, count; - if (!fprobe_is_registered(fp)) - return -EINVAL; + mutex_lock(&fprobe_mutex); + if (!fp || !is_fprobe_still_exist(fp)) { + ret = -EINVAL; + goto out; + } + + hlist_array = fp->hlist_array; + addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL); + if (!addrs) { + ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */ + goto out; + } - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_stop(fp->rethook); + /* Remove non-synonim ips from table and hash */ + count = 0; + for (i = 0; i < hlist_array->size; i++) { + if (!delete_fprobe_node(&hlist_array->array[i])) + addrs[count++] = hlist_array->array[i].addr; + } + del_fprobe_hash(fp); - ret = unregister_ftrace_function(&fp->ops); - if (ret < 0) - return ret; + if (count) + fprobe_graph_remove_ips(addrs, count); - if (!IS_ERR_OR_NULL(fp->rethook)) - rethook_free(fp->rethook); + kfree_rcu(hlist_array, rcu); + fp->hlist_array = NULL; - ftrace_free_filter(&fp->ops); +out: + mutex_unlock(&fprobe_mutex); + kfree(addrs); return ret; } EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e113f8b13a2..b2955e504193 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -536,24 +536,21 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; - int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER static struct trace_seq s; unsigned long long avg; unsigned long long stddev; #endif - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); /* we raced with function_profile_reset() */ - if (unlikely(rec->counter == 0)) { - ret = -EBUSY; - goto out; - } + if (unlikely(rec->counter == 0)) + return -EBUSY; #ifdef CONFIG_FUNCTION_GRAPH_TRACER avg = div64_ul(rec->time, rec->counter); if (tracing_thresh && (avg < tracing_thresh)) - goto out; + return 0; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -590,10 +587,8 @@ static int function_stat_show(struct seq_file *m, void *v) trace_print_seq(m, &s); #endif seq_putc(m, '\n'); -out: - mutex_unlock(&ftrace_profile_lock); - return ret; + return 0; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -789,27 +784,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip, { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; - unsigned long flags; if (!ftrace_profile_enabled) return; - local_irq_save(flags); + guard(preempt_notrace)(); stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; rec = ftrace_find_profiled_func(stat, ip); if (!rec) { rec = ftrace_profile_alloc(stat, ip); if (!rec) - goto out; + return; } rec->counter++; - out: - local_irq_restore(flags); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -827,7 +819,8 @@ struct profile_fgraph_data { }; static int profile_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct profile_fgraph_data *profile_data; @@ -849,26 +842,27 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace, } static void profile_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct profile_fgraph_data *profile_data; struct ftrace_profile_stat *stat; unsigned 
long long calltime; unsigned long long rettime = trace_clock_local(); struct ftrace_profile *rec; - unsigned long flags; int size; - local_irq_save(flags); + guard(preempt_notrace)(); + stat = this_cpu_ptr(&ftrace_profile_stats); if (!stat->hash || !ftrace_profile_enabled) - goto out; + return; profile_data = fgraph_retrieve_data(gops->idx, &size); /* If the calltime was zero'd ignore it */ if (!profile_data || !profile_data->calltime) - goto out; + return; calltime = rettime - profile_data->calltime; @@ -896,9 +890,6 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, rec->time += calltime; rec->time_squared += calltime * calltime; } - - out: - local_irq_restore(flags); } static struct fgraph_ops fprofiler_ops = { @@ -946,20 +937,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, val = !!val; - mutex_lock(&ftrace_profile_lock); + guard(mutex)(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { ret = ftrace_profile_init(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ret = register_ftrace_profiler(); - if (ret < 0) { - cnt = ret; - goto out; - } + if (ret < 0) + return ret; ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; @@ -970,8 +957,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_profiler(); } } - out: - mutex_unlock(&ftrace_profile_lock); *ppos += cnt; @@ -1671,14 +1656,12 @@ unsigned long ftrace_location(unsigned long ip) loc = ftrace_location_range(ip, ip); if (!loc) { if (!kallsyms_lookup_size_offset(ip, &size, &offset)) - goto out; + return 0; /* map sym+0 to __fentry__ */ if (!offset) loc = ftrace_location_range(ip, ip + size - 1); } - -out: return loc; } @@ -2073,7 +2056,7 @@ rollback: continue; if (rec == end) - goto err_out; + return -EBUSY; in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); @@ -2086,7 +2069,6 @@ rollback: rec->flags |= FTRACE_FL_IPMODIFY; } while_for_each_ftrace_rec(); -err_out: return -EBUSY; } @@ -4982,10 +4964,6 @@ static int cache_mod(struct trace_array *tr, return ftrace_add_mod(tr, func, module, enable); } -static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable); - #ifdef CONFIG_MODULES static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, char *mod, bool enable) @@ -5615,20 +5593,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex); __init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; - int ret = 0; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(cmd->name, p->name) == 0) { - ret = -EBUSY; - goto out_unlock; - } + if (strcmp(cmd->name, p->name) == 0) + return -EBUSY; } list_add(&cmd->list, &ftrace_commands); - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return 0; } /* @@ -5638,20 +5611,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd) __init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; - int ret = -ENODEV; - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { if (strcmp(cmd->name, p->name) == 0) { - ret = 0; list_del_init(&p->list); - goto out_unlock; + return 0; } } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -ENODEV; } static int ftrace_process_regex(struct ftrace_iterator *iter, @@ 
-5661,7 +5631,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; - int ret = -EINVAL; + int ret; func = strsep(&next, ":"); @@ -5678,17 +5648,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter, command = strsep(&next, ":"); - mutex_lock(&ftrace_cmd_mutex); + guard(mutex)(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { - if (strcmp(p->name, command) == 0) { - ret = p->func(tr, hash, func, command, next, enable); - goto out_unlock; - } + if (strcmp(p->name, command) == 0) + return p->func(tr, hash, func, command, next, enable); } - out_unlock: - mutex_unlock(&ftrace_cmd_mutex); - return ret; + return -EINVAL; } static ssize_t @@ -5722,12 +5689,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) - goto out; + return ret; } - ret = read; - out: - return ret; + return read; } ssize_t @@ -5788,7 +5753,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long *ips, unsigned int cnt, - int remove, int reset, int enable) + int remove, int reset, int enable, char *mod) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5814,7 +5779,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, goto out_regex_unlock; } - if (buf && !ftrace_match_records(hash, buf, len)) { + if (buf && !match_records(hash, buf, len, mod)) { + /* If this was for a module and nothing was enabled, flag it */ + if (mod) + (*orig_hash)->flags |= FTRACE_HASH_FL_MOD; + + /* + * Even if it is a mod, return error to let caller know + * nothing was added + */ ret = -EINVAL; goto out_regex_unlock; } @@ -5839,7 +5812,7 @@ static int ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -6217,7 +6190,38 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); + char *mod = NULL, *func, *command, *next = buf; + char *tmp __free(kfree) = NULL; + struct trace_array *tr = ops->private; + int ret; + + func = strsep(&next, ":"); + + /* This can also handle :mod: parsing */ + if (next) { + if (!tr) + return -EINVAL; + + command = strsep(&next, ":"); + if (strcmp(command, "mod") != 0) + return -EINVAL; + + mod = next; + len = command - func; + /* Save the original func as ftrace_set_hash() can modify it */ + tmp = kstrdup(func, GFP_KERNEL); + } + + ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod); + + if (tr && mod && ret < 0) { + /* Did tmp fail to allocate? 
*/ + if (!tmp) + return -ENOMEM; + ret = cache_mod(tr, tmp, mod, enable); + } + + return ret; } /** @@ -6381,6 +6385,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable) ftrace_ops_init(ops); + /* The trace_array is needed for caching module function filters */ + if (!ops->private) { + struct trace_array *tr = trace_get_global_array(); + + ops->private = tr; + ftrace_init_trace_array(tr); + } + while (buf) { func = strsep(&buf, ","); ftrace_set_regex(ops, func, strlen(func), 0, enable); @@ -7814,9 +7826,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) void ftrace_init_trace_array(struct trace_array *tr) { + if (tr->flags & TRACE_ARRAY_FL_MOD_INIT) + return; + INIT_LIST_HEAD(&tr->func_probes); INIT_LIST_HEAD(&tr->mod_trace); INIT_LIST_HEAD(&tr->mod_notrace); + + tr->flags |= TRACE_ARRAY_FL_MOD_INIT; } #else @@ -7845,7 +7862,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; - tr->ops->private = tr; + if (!global_ops.private) + global_ops.private = tr; ftrace_init_trace_array(tr); init_array_fgraph_ops(tr, tr->ops); } @@ -8287,7 +8305,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (!cnt) return 0; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); switch (type) { case TRACE_PIDS: @@ -8303,14 +8321,13 @@ pid_write(struct file *filp, const char __user *ubuf, lockdep_is_held(&ftrace_lock)); break; default: - ret = -EINVAL; WARN_ON_ONCE(1); - goto out; + return -EINVAL; } ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt); if (ret < 0) - goto out; + return ret; switch (type) { case TRACE_PIDS: @@ -8339,11 +8356,8 @@ pid_write(struct file *filp, const char __user *ubuf, ftrace_update_pid_func(); ftrace_startup_all(0); - out: - mutex_unlock(&ftrace_lock); - if (ret > 0) - *ppos += ret; + *ppos += ret; return ret; } @@ -8746,17 +8760,17 @@ static int ftrace_enable_sysctl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret = -ENODEV; + int ret; - mutex_lock(&ftrace_lock); + guard(mutex)(&ftrace_lock); if (unlikely(ftrace_disabled)) - goto out; + return -ENODEV; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) - goto out; + return ret; if (ftrace_enabled) { @@ -8770,8 +8784,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } else { if (is_permanent_ops_registered()) { ftrace_enabled = true; - ret = -EBUSY; - goto out; + return -EBUSY; } /* stopping ftrace calls (just send to ftrace_stub) */ @@ -8781,9 +8794,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write, } last_ftrace_enabled = !!ftrace_enabled; - out: - mutex_unlock(&ftrace_lock); - return ret; + return 0; } static struct ctl_table ftrace_sysctls[] = { diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 4966e6bbdf6f..c62b9b3cfb3d 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void) int i; /* According to linux/thread.h, pids can be no bigger that 30 bits */ - WARN_ON_ONCE(pid_max > (1 << 30)); + WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30)); pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 60210fb5b211..6d61ff78926b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4682,40 +4682,22 @@ int 
ring_buffer_write(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_write); -static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) { - struct buffer_page *reader = cpu_buffer->reader_page; - struct buffer_page *head = rb_set_head_page(cpu_buffer); - struct buffer_page *commit = cpu_buffer->commit_page; - - /* In case of error, head will be NULL */ - if (unlikely(!head)) - return true; - - /* Reader should exhaust content in reader page */ - if (reader->read != rb_page_size(reader)) - return false; - - /* - * If writers are committing on the reader page, knowing all - * committed content has been read, the ring buffer is empty. - */ - if (commit == reader) - return true; - - /* - * If writers are committing on a page other than reader page - * and head page, there should always be content to read. - */ - if (commit != head) - return false; + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} - /* - * Writers are committing on the head page, we just need - * to care about there're committed data, and the reader will - * swap reader page with head page when it is to read data. - */ - return rb_page_commit(commit) == 0; +static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + return !rb_num_of_entries(cpu_buffer); } /** @@ -4861,19 +4843,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); -/* - * The total entries in the ring buffer is the running counter - * of entries entered into the ring buffer, minus the sum of - * the entries read from the ring buffer and the number of - * entries that were overwritten. - */ -static inline unsigned long -rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) -{ - return local_read(&cpu_buffer->entries) - - (local_read(&cpu_buffer->overrun) + cpu_buffer->read); -} - /** * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer * @buffer: The ring buffer @@ -7059,7 +7028,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { - struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + struct page *page; int off = 0; if (WARN_ON_ONCE(s >= nr_subbufs)) { @@ -7067,6 +7036,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, goto out; } + page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) break; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8aebcb01e62..2542ec398b5d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4122,6 +4122,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) preempt_model_none() ? "server" : preempt_model_voluntary() ? "desktop" : preempt_model_full() ? "preempt" : + preempt_model_lazy() ? "lazy" : preempt_model_rt() ? 
"preempt_rt" : "unknown", /* These are reserved for later use */ @@ -10660,6 +10661,14 @@ out: return ret; } +#ifdef CONFIG_FUNCTION_TRACER +/* Used to set module cached ftrace filtering at boot up */ +__init struct trace_array *trace_get_global_array(void) +{ + return &global_trace; +} +#endif + void __init ftrace_boot_snapshot(void) { #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9691b47b5f3d..04058a9889b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -432,6 +432,7 @@ struct trace_array { enum { TRACE_ARRAY_FL_GLOBAL = BIT(0), TRACE_ARRAY_FL_BOOT = BIT(1), + TRACE_ARRAY_FL_MOD_INIT = BIT(2), }; extern struct list_head ftrace_trace_arrays; @@ -693,8 +694,10 @@ void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); -void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops); -int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops); +void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); +int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops, + struct ftrace_regs *fregs); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -717,8 +720,6 @@ extern unsigned long tracing_thresh; /* PID filtering */ -extern int pid_max; - bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, @@ -1114,6 +1115,7 @@ void ftrace_destroy_function_files(struct trace_array *tr); int ftrace_allocate_ftrace_ops(struct trace_array *tr); void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); +struct trace_array *trace_get_global_array(void); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index c62d1629cffe..b8f3c4ba309b 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -134,7 +134,7 @@ static int process_fetch_insn(struct fetch_insn *code, void *rec, void *edata, void *dest, void *base) { - struct pt_regs *regs = rec; + struct ftrace_regs *fregs = rec; unsigned long val; int ret; @@ -142,17 +142,17 @@ retry: /* 1st stage: get value from context */ switch (code->op) { case FETCH_OP_STACK: - val = regs_get_kernel_stack_nth(regs, code->param); + val = ftrace_regs_get_kernel_stack_nth(fregs, code->param); break; case FETCH_OP_STACKP: - val = kernel_stack_pointer(regs); + val = ftrace_regs_get_stack_pointer(fregs); break; case FETCH_OP_RETVAL: - val = regs_return_value(regs); + val = ftrace_regs_get_return_value(fregs); break; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API case FETCH_OP_ARG: - val = regs_get_kernel_argument(regs, code->param); + val = ftrace_regs_get_argument(fregs, code->param); break; case FETCH_OP_EDATA: val = *(unsigned long *)((unsigned long)edata + code->offset); @@ -175,7 +175,7 @@ NOKPROBE_SYMBOL(process_fetch_insn) /* function entry handler */ static nokprobe_inline void __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs, + struct ftrace_regs *fregs, struct trace_event_file *trace_file) { struct fentry_trace_entry_head *entry; @@ 
-189,41 +189,71 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs, NULL); + dsize = __get_data_size(&tf->tp, fregs, NULL); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->ip = entry_ip; - store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fentry_trace_func(tf, entry_ip, regs, link->file); + __fentry_trace_func(tf, entry_ip, fregs, link->file); } NOKPROBE_SYMBOL(fentry_trace_func); +static nokprobe_inline +void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs) +{ + struct probe_entry_arg *earg = tp->entry_arg; + unsigned long val = 0; + int i; + + if (!earg) + return; + + for (i = 0; i < earg->size; i++) { + struct fetch_insn *code = &earg->code[i]; + + switch (code->op) { + case FETCH_OP_ARG: + val = ftrace_regs_get_argument(fregs, code->param); + break; + case FETCH_OP_ST_EDATA: + *(unsigned long *)((unsigned long)edata + code->offset) = val; + break; + case FETCH_OP_END: + goto end; + default: + break; + } + } +end: + return; +} + /* function exit handler */ static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); if (tf->tp.entry_arg) - store_trace_entry_data(entry_data, &tf->tp, regs); + store_fprobe_entry_data(entry_data, &tf->tp, fregs); return 0; } @@ -231,7 +261,7 @@ NOKPROBE_SYMBOL(trace_fprobe_entry_handler) static nokprobe_inline void __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data, struct trace_event_file *trace_file) { struct fexit_trace_entry_head *entry; @@ -245,60 +275,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, if (trace_trigger_soft_disabled(trace_file)) return; - dsize = __get_data_size(&tf->tp, regs, entry_data); + dsize = __get_data_size(&tf->tp, fregs, entry_data); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + tf->tp.size + dsize); if (!entry) return; - fbuffer.regs = regs; + fbuffer.regs = ftrace_get_regs(fregs); entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); trace_event_buffer_commit(&fbuffer); } static void fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, void *entry_data) + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct event_file_link *link; trace_probe_for_each_link_rcu(link, &tf->tp) - __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file); + 
__fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file); } NOKPROBE_SYMBOL(fexit_trace_func); #ifdef CONFIG_PERF_EVENTS static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - struct pt_regs *regs) + struct ftrace_regs *fregs) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fentry_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return 0; - dsize = __get_data_size(&tf->tp, regs, NULL); + dsize = __get_data_size(&tf->tp, fregs, NULL); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return 0; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->ip = entry_ip; memset(&entry[1], 0, dsize); - store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); return 0; @@ -307,31 +340,34 @@ NOKPROBE_SYMBOL(fentry_perf_func); static void fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_event_call *call = trace_probe_event_call(&tf->tp); struct fexit_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; + struct pt_regs *regs; int rctx; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; - dsize = __get_data_size(&tf->tp, regs, entry_data); + dsize = __get_data_size(&tf->tp, fregs, entry_data); __size = sizeof(*entry) + tf->tp.size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_alloc(size, NULL, &rctx); + entry = perf_trace_buf_alloc(size, ®s, &rctx); if (!entry) return; + regs = ftrace_fill_perf_regs(fregs, regs); + entry->func = entry_ip; entry->ret_ip = ret_ip; - store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize); + store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); } @@ -339,33 +375,34 @@ NOKPROBE_SYMBOL(fexit_perf_func); #endif /* CONFIG_PERF_EVENTS */ static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); int ret = 0; if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fentry_trace_func(tf, entry_ip, regs); + fentry_trace_func(tf, entry_ip, fregs); + #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - ret = fentry_perf_func(tf, entry_ip, regs); + ret = fentry_perf_func(tf, entry_ip, fregs); #endif return ret; } NOKPROBE_SYMBOL(fentry_dispatcher); static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data) { struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp); if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE)) - fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data); + fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data); 
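Editor's note: the dispatchers above now hand struct ftrace_regs to the user callbacks, so argument and return-value access goes through the ftrace_regs accessors rather than pt_regs helpers. Below is a hedged sketch of a probe that captures the first argument into per-call entry_data at entry and reports it next to the return value at exit. The probed symbol is illustrative, and ftrace_regs_get_argument() is only usable on architectures providing CONFIG_HAVE_FUNCTION_ARG_ACCESS_API, as the fetch code above also assumes.

#include <linux/module.h>
#include <linux/fprobe.h>
#include <linux/ftrace.h>

struct sample_data {
	unsigned long arg0;	/* first argument, captured at entry */
};

static int arg_entry(struct fprobe *fp, unsigned long entry_ip,
		     unsigned long ret_ip, struct ftrace_regs *fregs,
		     void *entry_data)
{
	struct sample_data *d = entry_data;

	if (!d)
		return 0;
	d->arg0 = ftrace_regs_get_argument(fregs, 0);
	return 0;	/* keep the exit handler armed for this call */
}

static void arg_exit(struct fprobe *fp, unsigned long entry_ip,
		     unsigned long ret_ip, struct ftrace_regs *fregs,
		     void *entry_data)
{
	struct sample_data *d = entry_data;

	if (!d)
		return;
	pr_info("%pS: arg0=%lx ret=%lx\n", (void *)entry_ip, d->arg0,
		ftrace_regs_get_return_value(fregs));
}

static struct fprobe arg_probe = {
	.entry_handler	 = arg_entry,
	.exit_handler	 = arg_exit,
	.entry_data_size = sizeof(struct sample_data),
};

static int __init arg_probe_init(void)
{
	return register_fprobe(&arg_probe, "kernel_clone", NULL);
}

static void __exit arg_probe_cleanup(void)
{
	unregister_fprobe(&arg_probe);
}

module_init(arg_probe_init);
module_exit(arg_probe_cleanup);
MODULE_LICENSE("GPL");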
#ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE)) - fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data); + fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data); #endif } NOKPROBE_SYMBOL(fexit_dispatcher); @@ -379,6 +416,9 @@ static void free_trace_fprobe(struct trace_fprobe *tf) } } +/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */ +DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T)) + /* * Allocate new trace_probe and initialize it (including fprobe). */ @@ -387,10 +427,9 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *symbol, struct tracepoint *tpoint, struct module *mod, - int maxactive, int nargs, bool is_return) { - struct trace_fprobe *tf; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL); @@ -399,7 +438,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->symbol = kstrdup(symbol, GFP_KERNEL); if (!tf->symbol) - goto error; + return ERR_PTR(-ENOMEM); if (is_return) tf->fp.exit_handler = fexit_dispatcher; @@ -408,17 +447,13 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->tpoint = tpoint; tf->mod = mod; - tf->fp.nr_maxactive = maxactive; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) - goto error; + return ERR_PTR(ret); dyn_event_init(&tf->devent, &trace_fprobe_ops); - return tf; -error: - free_trace_fprobe(tf); - return ERR_PTR(ret); + return_ptr(tf); } static struct trace_fprobe *find_trace_fprobe(const char *event, @@ -845,14 +880,12 @@ static int register_trace_fprobe(struct trace_fprobe *tf) struct trace_fprobe *old_tf; int ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); - if (old_tf) { - ret = append_trace_fprobe(tf, old_tf); - goto end; - } + if (old_tf) + return append_trace_fprobe(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -862,7 +895,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf) trace_probe_log_err(0, EVENT_EXIST); } else pr_warn("Failed to register probe event(%d)\n", ret); - goto end; + return ret; } /* Register fprobe */ @@ -872,8 +905,6 @@ static int register_trace_fprobe(struct trace_fprobe *tf) else dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp)); -end: - mutex_unlock(&event_mutex); return ret; } @@ -1034,7 +1065,10 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -static int __trace_fprobe_create(int argc, const char *argv[]) +DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) + +static int trace_fprobe_create_internal(int argc, const char *argv[], + struct traceprobe_parse_context *ctx) { /* * Argument syntax: @@ -1060,24 +1094,20 @@ static int __trace_fprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ - struct trace_fprobe *tf = NULL; - int i, len, new_argc = 0, ret = 0; + struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; + int i, new_argc = 0, ret = 0; bool is_return = false; - char *symbol = NULL; + char *symbol __free(kfree) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; - const char **new_argv = NULL; - int maxactive = 0; + const char **new_argv __free(kfree) = NULL; char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf = NULL; + char *dbuf __free(kfree) = NULL; bool is_tracepoint = false; - struct module *tp_mod = NULL; + struct module *tp_mod __free(module_put) = NULL; struct tracepoint *tpoint = NULL; - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1087,35 +1117,13 @@ static int __trace_fprobe_create(int argc, const char *argv[]) group = TRACEPOINT_EVENT_SYSTEM; } - trace_probe_log_init("trace_fprobe", argc, argv); - - event = strchr(&argv[0][1], ':'); - if (event) - event++; - - if (isdigit(argv[0][1])) { - if (event) - len = event - &argv[0][1] - 1; - else - len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) { - trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - memcpy(buf, &argv[0][1], len); - buf[len] = '\0'; - ret = kstrtouint(buf, 0, &maxactive); - if (ret || !maxactive) { + if (argv[0][1] != '\0') { + if (argv[0][1] != ':') { + trace_probe_log_set_index(0); trace_probe_log_err(1, BAD_MAXACT); - goto parse_error; - } - /* fprobe rethook instances are iterated over via a list. The - * maximum should stay reasonable. - */ - if (maxactive > RETHOOK_MAXACTIVE_MAX) { - trace_probe_log_err(1, MAXACT_TOO_BIG); - goto parse_error; + return -EINVAL; } + event = &argv[0][2]; } trace_probe_log_set_index(1); @@ -1123,20 +1131,14 @@ static int __trace_fprobe_create(int argc, const char *argv[]) /* a symbol(or tracepoint) must be specified */ ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint); if (ret < 0) - goto parse_error; - - if (!is_return && maxactive) { - trace_probe_log_set_index(0); - trace_probe_log_err(1, BAD_MAXACT_TYPE); - goto parse_error; - } + return -EINVAL; trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) - goto parse_error; + return -EINVAL; } if (!event) { @@ -1152,67 +1154,62 @@ static int __trace_fprobe_create(int argc, const char *argv[]) } if (is_return) - ctx.flags |= TPARG_FL_RETURN; + ctx->flags |= TPARG_FL_RETURN; else - ctx.flags |= TPARG_FL_FENTRY; + ctx->flags |= TPARG_FL_FENTRY; if (is_tracepoint) { - ctx.flags |= TPARG_FL_TPOINT; + ctx->flags |= TPARG_FL_TPOINT; tpoint = find_tracepoint(symbol, &tp_mod); if (tpoint) { - ctx.funcname = kallsyms_lookup( + ctx->funcname = kallsyms_lookup( (unsigned long)tpoint->probestub, NULL, NULL, NULL, sbuf); } else if (IS_ENABLED(CONFIG_MODULES)) { /* This *may* be loaded afterwards */ tpoint = TRACEPOINT_STUB; - ctx.funcname = symbol; + ctx->funcname = symbol; } else { trace_probe_log_set_index(1); trace_probe_log_err(0, NO_TRACEPOINT); - goto parse_error; + return -EINVAL; } } else - ctx.funcname = symbol; + ctx->funcname = symbol; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, - abuf, MAX_BTF_ARGS_LEN, &ctx); - if (IS_ERR(new_argv)) { - ret = PTR_ERR(new_argv); - new_argv = NULL; - goto out; - } + abuf, MAX_BTF_ARGS_LEN, ctx); + if 
(IS_ERR(new_argv)) + return PTR_ERR(new_argv); if (new_argv) { argc = new_argc; argv = new_argv; } - if (argc > MAX_TRACE_ARGS) { - ret = -E2BIG; - goto out; - } + if (argc > MAX_TRACE_ARGS) + return -E2BIG; ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); if (ret) - goto out; + return ret; /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, - maxactive, argc, is_return); + argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); - goto out; /* We know tf is not allocated */ + return ret; } /* parse arguments */ for (i = 0; i < argc; i++) { trace_probe_log_set_index(i + 2); - ctx.offset = 0; - ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx); + ctx->offset = 0; + ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx); if (ret) - goto error; /* This can be -ENOMEM */ + return ret; /* This can be -ENOMEM */ } if (is_return && tf->tp.entry_arg) { @@ -1223,7 +1220,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) ret = traceprobe_set_print_fmt(&tf->tp, is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL); if (ret < 0) - goto error; + return ret; ret = register_trace_fprobe(tf); if (ret) { @@ -1234,29 +1231,32 @@ static int __trace_fprobe_create(int argc, const char *argv[]) trace_probe_log_err(0, BAD_PROBE_ADDR); else if (ret != -ENOMEM && ret != -EEXIST) trace_probe_log_err(0, FAIL_REG_PROBE); - goto error; + return -EINVAL; } -out: - if (tp_mod) - module_put(tp_mod); + /* 'tf' is successfully registered. To avoid freeing, assign NULL. */ + tf = NULL; + + return 0; +} + +static int trace_fprobe_create_cb(int argc, const char *argv[]) +{ + struct traceprobe_parse_context ctx = { + .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + }; + int ret; + + trace_probe_log_init("trace_fprobe", argc, argv); + ret = trace_fprobe_create_internal(argc, argv, &ctx); traceprobe_finish_parse(&ctx); trace_probe_log_clear(); - kfree(new_argv); - kfree(symbol); - kfree(dbuf); return ret; - -parse_error: - ret = -EINVAL; -error: - free_trace_fprobe(tf); - goto out; } static int trace_fprobe_create(const char *raw_command) { - return trace_probe_create(raw_command, __trace_fprobe_create); + return trace_probe_create(raw_command, trace_fprobe_create_cb); } static int trace_fprobe_release(struct dyn_event *ev) @@ -1278,8 +1278,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) seq_putc(m, 't'); else seq_putc(m, 'f'); - if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive) - seq_printf(m, "%d", tf->fp.nr_maxactive); seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp), trace_probe_name(&tf->tp)); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 5504b5e4e7b4..dc62eb93837a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -175,16 +175,16 @@ struct fgraph_times { }; int trace_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; long disabled; - int ret; + int ret = 0; int cpu; if (*task_var & TRACE_GRAPH_NOTRACE) @@ -235,25 +235,21 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, if (tracing_thresh) return 1; - local_irq_save(flags); + preempt_disable_notrace(); cpu = 
raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); - if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && - tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) { + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); + if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) && + tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) { unsigned long retaddr = ftrace_graph_top_ret_addr(current); - ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr); - } else + } else { ret = __trace_graph_entry(tr, trace, trace_ctx); - } else { - ret = 0; + } } - - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); return ret; } @@ -314,13 +310,12 @@ static void handle_nosleeptime(struct ftrace_graph_ret *trace, } void trace_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, struct ftrace_regs *fregs) { unsigned long *task_var = fgraph_get_task_var(gops); struct trace_array *tr = gops->private; struct trace_array_cpu *data; struct fgraph_times *ftimes; - unsigned long flags; unsigned int trace_ctx; long disabled; int size; @@ -341,20 +336,20 @@ void trace_graph_return(struct ftrace_graph_ret *trace, trace->calltime = ftimes->calltime; - local_irq_save(flags); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) { - trace_ctx = tracing_gen_ctx_flags(flags); + disabled = atomic_read(&data->disabled); + if (likely(!disabled)) { + trace_ctx = tracing_gen_ctx(); __trace_graph_return(tr, trace, trace_ctx); } - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); } static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_times *ftimes; int size; @@ -378,7 +373,7 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace, (trace->rettime - ftimes->calltime < tracing_thresh)) return; else - trace_graph_return(trace, gops); + trace_graph_return(trace, gops, fregs); } static struct fgraph_ops funcgraph_ops = { diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index fce064e20570..08786c59d397 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -176,12 +176,14 @@ static int irqsoff_display_graph(struct trace_array *tr, int set) } static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; + u64 *calltime; int ret; if (ftrace_graph_ignore_func(gops, trace)) @@ -199,6 +201,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, if (!func_prolog_dec(tr, &data, &flags)) return 0; + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (!calltime) + return 0; + + *calltime = trace_clock_local(); + trace_ctx = tracing_gen_ctx_flags(flags); ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); @@ -207,18 +215,26 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace, } static void irqsoff_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct 
fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; unsigned int trace_ctx; + u64 *calltime; + int size; ftrace_graph_addr_finish(gops, trace); if (!func_prolog_dec(tr, &data, &flags)) return; + calltime = fgraph_retrieve_data(gops->idx, &size); + if (!calltime) + return; + trace->calltime = *calltime; + trace_ctx = tracing_gen_ctx_flags(flags); __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 2caf0d2afb32..f39b37fcdb3b 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -232,7 +232,7 @@ array: /* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int -__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata) +__get_data_size(struct trace_probe *tp, void *regs, void *edata) { struct probe_arg *arg; int i, len, ret = 0; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 573b5d8e8a28..cb49f7279dc8 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void) if (tgid_map) return 0; - tgid_map_max = pid_max; + tgid_map_max = init_pid_ns.pid_max; map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), GFP_KERNEL); if (!map) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index d6c7f18daa15..f372252dc8bb 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -113,11 +113,13 @@ static int wakeup_display_graph(struct trace_array *tr, int set) } static int wakeup_graph_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; int ret = 0; if (ftrace_graph_ignore_func(gops, trace)) @@ -135,6 +137,12 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return 0; + calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime)); + if (!calltime) + return 0; + + *calltime = trace_clock_local(); + ret = __trace_graph_entry(tr, trace, trace_ctx); atomic_dec(&data->disabled); preempt_enable_notrace(); @@ -143,17 +151,25 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace, } static void wakeup_graph_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned int trace_ctx; + u64 *calltime; + int size; ftrace_graph_addr_finish(gops, trace); if (!func_prolog_preempt_disable(tr, &data, &trace_ctx)) return; + calltime = fgraph_retrieve_data(gops->idx, &size); + if (!calltime) + return; + trace->calltime = *calltime; + __trace_graph_return(tr, trace, trace_ctx); atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 38b5754790c9..d88c44f1dfa5 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -774,7 +774,8 @@ struct fgraph_fixture { }; static __init int store_entry(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = 
fixture->store_type_name; @@ -807,7 +808,8 @@ static __init int store_entry(struct ftrace_graph_ent *trace, } static __init void store_return(struct ftrace_graph_ret *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops); const char *type = fixture->store_type_name; @@ -1025,7 +1027,8 @@ static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops) + struct fgraph_ops *gops, + struct ftrace_regs *fregs) { /* This is harmlessly racy, we want to approximately detect a hang */ if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { @@ -1039,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace, return 0; } - return trace_graph_entry(trace, gops); + return trace_graph_entry(trace, gops, fregs); } static struct fgraph_ops fgraph_ops __initdata = { diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 1895fbc32bcb..5267adeaa403 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; - bit += page->index; + bit += page->private; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); @@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; - pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; + pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9362484a653c..33a23c7b2274 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7840,7 +7840,7 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ |
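Editor's note: across trace_functions_graph.c, trace_irqsoff.c, trace_sched_wakeup.c and trace_selftest.c above, the fgraph entry/return callbacks gain a struct ftrace_regs * parameter, and the latency tracers keep the per-call timestamp in fgraph_reserve_data()/fgraph_retrieve_data() storage. A minimal sketch of that pattern follows, assuming a built-in caller (register_ftrace_graph() is not exported to modules); the filtered symbol and the printed message are illustrative.

#include <linux/ftrace.h>
#include <linux/trace_clock.h>
#include <linux/init.h>
#include <linux/printk.h>
#include <linux/string.h>

static int sketch_entry(struct ftrace_graph_ent *trace,
			struct fgraph_ops *gops, struct ftrace_regs *fregs)
{
	u64 *calltime;

	/* Reserve per-call storage on the shadow stack for this gops slot. */
	calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
	if (!calltime)
		return 0;	/* no room: do not trace this call */

	*calltime = trace_clock_local();
	return 1;		/* non-zero: hook the function return */
}

static void sketch_return(struct ftrace_graph_ret *trace,
			  struct fgraph_ops *gops, struct ftrace_regs *fregs)
{
	u64 *calltime;
	int size;

	/* Retrieve what the matching entry callback stored for this call. */
	calltime = fgraph_retrieve_data(gops->idx, &size);
	if (!calltime)
		return;

	pr_info("%ps took %llu ns\n", (void *)trace->func,
		trace_clock_local() - *calltime);
}

static struct fgraph_ops sketch_gops = {
	.entryfunc = sketch_entry,
	.retfunc   = sketch_return,
};

static int __init sketch_init(void)
{
	static char filter[] = "kernel_clone";	/* illustrative target */

	/* Filter on the embedded ftrace_ops, then register; idx is assigned here. */
	ftrace_set_filter(&sketch_gops.ops, filter, strlen(filter), 1);
	return register_ftrace_graph(&sketch_gops);
}
late_initcall(sketch_init);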