Diffstat (limited to 'kernel')
 kernel/cgroup/cgroup-internal.h |  7
 kernel/cgroup/cgroup-v1.c       | 20
 kernel/cgroup/cgroup.c          | 40
 kernel/cgroup/cpuset.c          | 11
 kernel/cgroup/namespace.c       |  2
 kernel/sched/core.c             |  4
 kernel/sched/cputime.c          | 27
 kernel/sched/sched.h            |  9
 kernel/trace/blktrace.c         | 35
 kernel/workqueue.c              |  5
10 files changed, 90 insertions(+), 70 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 9203bfb05603..00f4d6bf048f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -5,6 +5,7 @@
 #include <linux/kernfs.h>
 #include <linux/workqueue.h>
 #include <linux/list.h>
+#include <linux/refcount.h>
 
 /*
  * A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 	 * rwlock
 	 */
-	if (atomic_add_unless(&cset->refcount, -1, 1))
+	if (refcount_dec_not_one(&cset->refcount))
 		return;
 
 	spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
  */
 static inline void get_css_set(struct css_set *cset)
 {
-	atomic_inc(&cset->refcount);
+	refcount_inc(&cset->refcount);
 }
 
 bool cgroup_ssid_enabled(int ssid);
@@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
 
 void cgroup_free_root(struct cgroup_root *root);
 void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
 int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
 struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
 			       struct cgroup_root *root, unsigned long magic,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6b49f5..85d75152402d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += atomic_read(&link->cset->refcount);
+		count += refcount_read(&link->cset->refcount);
 	spin_unlock_irq(&css_set_lock);
 	return count;
 }
@@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
 	struct cgroup_subsys *ss;
 	struct dentry *dentry;
 	int i, ret;
+	bool new_root = false;
 
 	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
 
@@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
 		ret = -ENOMEM;
 		goto out_unlock;
 	}
+	new_root = true;
 
 	init_cgroup_root(root, &opts);
 
-	ret = cgroup_setup_root(root, opts.subsys_mask);
+	ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD);
 	if (ret)
 		cgroup_free_root(root);
 
@@ -1201,6 +1203,18 @@ out_free:
 				 CGROUP_SUPER_MAGIC, ns);
 
 	/*
+	 * There's a race window after we release cgroup_mutex and before
+	 * allocating a superblock. Make sure a concurrent process won't
+	 * be able to re-use the root during this window by delaying the
+	 * initialization of root refcnt.
+	 */
+	if (new_root) {
+		mutex_lock(&cgroup_mutex);
+		percpu_ref_reinit(&root->cgrp.self.refcnt);
+		mutex_unlock(&cgroup_mutex);
+	}
+
+	/*
 	 * If @pinned_sb, we're reusing an existing root and holding an
 	 * extra ref on its sb. Mount is complete. Put the extra ref.
 	 */
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
 	u64 count;
 
 	rcu_read_lock();
-	count = atomic_read(&task_css_set(current)->refcount);
+	count = refcount_read(&task_css_set(current)->refcount);
 	rcu_read_unlock();
 	return count;
 }
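The put_css_set() conversion above hinges on refcount_dec_not_one(), which decrements only when the counter is not 1 and, unlike the old atomic_add_unless(&ref, -1, 1), saturates instead of wrapping on misuse. A minimal sketch of that "drop unless last reference" pattern, with a hypothetical struct foo standing in for css_set and foo_lock for css_set_lock:

#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	refcount_t refcount;
	/* ... payload ... */
};

/* Lookups that take new references must hold foo_lock. */
static DEFINE_SPINLOCK(foo_lock);

static void foo_put(struct foo *f)
{
	/* Fast path: drop a reference that is provably not the last. */
	if (refcount_dec_not_one(&f->refcount))
		return;

	/*
	 * Slow path: take the lock so nobody can gain a reference
	 * while we drop the final one, mirroring how put_css_set()
	 * serializes against css_set_lock.
	 */
	spin_lock(&foo_lock);
	if (refcount_dec_and_test(&f->refcount))
		kfree(f);
	spin_unlock(&foo_lock);
}

The fast path never touches the lock; only the thread that sees the count at 1 serializes against lookups before freeing.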
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 687f5e0194ef..c3c9a0e1b3c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
-	.count		= { .counter = 2, },
+	.count		= REFCOUNT_INIT(2),
 	.user_ns	= &init_user_ns,
 	.ns.ops		= &cgroupns_operations,
 	.ns.inum	= PROC_CGROUP_INIT_INO,
@@ -436,7 +436,12 @@ out_unlock:
 	return css;
 }
 
-static void cgroup_get(struct cgroup *cgrp)
+static void __maybe_unused cgroup_get(struct cgroup *cgrp)
+{
+	css_get(&cgrp->self);
+}
+
+static void cgroup_get_live(struct cgroup *cgrp)
 {
 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
 	css_get(&cgrp->self);
@@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css);
  * haven't been created.
  */
 struct css_set init_css_set = {
-	.refcount		= ATOMIC_INIT(1),
+	.refcount		= REFCOUNT_INIT(1),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
 	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset)
 
 	lockdep_assert_held(&css_set_lock);
 
-	if (!atomic_dec_and_test(&cset->refcount))
+	if (!refcount_dec_and_test(&cset->refcount))
 		return;
 
 	/* This css_set is dead. unlink it and release cgroup and css refs */
@@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 
 	if (cgroup_parent(cgrp))
-		cgroup_get(cgrp);
+		cgroup_get_live(cgrp);
 }
 
 /**
@@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 		return NULL;
 	}
 
-	atomic_set(&cset->refcount, 1);
+	refcount_set(&cset->refcount, 1);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
 	INIT_LIST_HEAD(&cset->task_iters);
@@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 
-int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
+int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
@@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	root_cgrp->id = ret;
 	root_cgrp->ancestor_ids[0] = ret;
 
-	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
-			      GFP_KERNEL);
+	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
+			      ref_flags, GFP_KERNEL);
 	if (ret)
 		goto out;
 
@@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			return ERR_PTR(-EINVAL);
 		}
 		cgrp_dfl_visible = true;
-		cgroup_get(&cgrp_dfl_root.cgrp);
+		cgroup_get_live(&cgrp_dfl_root.cgrp);
 
 		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
 					 CGROUP2_SUPER_MAGIC, ns);
@@ -2576,7 +2581,7 @@ restart:
 			if (!css || !percpu_ref_is_dying(&css->refcnt))
 				continue;
 
-			cgroup_get(dsct);
+			cgroup_get_live(dsct);
 			prepare_to_wait(&dsct->offline_waitq, &wait,
 					TASK_UNINTERRUPTIBLE);
 
@@ -3947,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 {
 	lockdep_assert_held(&cgroup_mutex);
 
-	cgroup_get(cgrp);
+	cgroup_get_live(cgrp);
 
 	memset(css, 0, sizeof(*css));
 	css->cgroup = cgrp;
@@ -4123,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	/* allocation complete, commit to creation */
 	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
 	atomic_inc(&root->nr_cgrps);
-	cgroup_get(parent);
+	cgroup_get_live(parent);
 
 	/*
 	 * @cgrp is now fully operational. If something fails after this
@@ -4513,7 +4518,7 @@ int __init cgroup_init(void)
 	hash_add(css_set_table, &init_css_set.hlist,
 		 css_set_hash(init_css_set.subsys));
 
-	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
+	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0));
 
 	mutex_unlock(&cgroup_mutex);
 
@@ -4947,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path)
 	if (kn) {
 		if (kernfs_type(kn) == KERNFS_DIR) {
 			cgrp = kn->priv;
-			cgroup_get(cgrp);
+			cgroup_get_live(cgrp);
 		} else {
 			cgrp = ERR_PTR(-ENOTDIR);
 		}
@@ -5027,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 
 	/* Socket clone path */
 	if (skcd->val) {
+		/*
+		 * We might be cloning a socket which is left in an empty
+		 * cgroup and the cgroup might have already been rmdir'd.
+		 * Don't use cgroup_get_live().
+		 */
		cgroup_get(sock_cgroup_ptr(skcd));
 		return;
 	}
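cgroup_setup_root() now takes ref_flags so that cgroup1_mount() can create the root's percpu_ref in a dead state (PERCPU_REF_INIT_DEAD) and flip it live with percpu_ref_reinit() only once the superblock exists, closing the reuse window described in the comment above. A rough sketch of that lifecycle; the percpu_ref calls are the real API, while demo_release() and demo_setup() are hypothetical stand-ins:

#include <linux/percpu-refcount.h>

/* Hypothetical release callback, invoked when the last ref drops. */
static void demo_release(struct percpu_ref *ref)
{
	/* free the object embedding 'ref' here */
}

static int demo_setup(struct percpu_ref *ref)
{
	int ret;

	/* Start dead: percpu_ref_tryget_live() fails until reinit. */
	ret = percpu_ref_init(ref, demo_release,
			      PERCPU_REF_INIT_DEAD, GFP_KERNEL);
	if (ret)
		return ret;

	/* ... publish the object, finish the expensive setup ... */

	/* Flip the ref alive; from here a tryget can succeed. */
	percpu_ref_reinit(ref);
	return 0;
}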
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f41292be0fb..f6501f4f6040 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2121,10 +2121,8 @@ int __init cpuset_init(void)
 {
 	int err = 0;
 
-	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
-		BUG();
-	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
-		BUG();
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
+	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
@@ -2139,8 +2137,7 @@ int __init cpuset_init(void)
 	if (err < 0)
 		return err;
 
-	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
-		BUG();
+	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
 
 	return 0;
 }
@@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		rebuild_sched_domains();
 }
 
-void cpuset_update_active_cpus(bool cpu_online)
+void cpuset_update_active_cpus(void)
 {
 	/*
 	 * We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 96d38dab6fb2..66129eb4371d 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
 		kfree(new_ns);
 		return ERR_PTR(ret);
 	}
-	atomic_set(&new_ns->count, 1);
+	refcount_set(&new_ns->count, 1);
 	new_ns->ns.ops = &cgroupns_operations;
 	return new_ns;
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..430b0460db89 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5732,7 +5732,7 @@ static void cpuset_cpu_active(void)
		 * cpuset configurations.
		 */
 	}
-	cpuset_update_active_cpus(true);
+	cpuset_update_active_cpus();
 }
 
 static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5755,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 		if (overflow)
 			return -EBUSY;
 
-		cpuset_update_active_cpus(false);
+		cpuset_update_active_cpus();
 	} else {
 		num_cpus_frozen++;
 		partition_sched_domains(1, NULL, NULL);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f3778e2b46c8..aea3135c5d90 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
 	sched_clock_irqtime = 0;
 }
 
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+				  enum cpu_usage_stat idx)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+	u64_stats_update_begin(&irqtime->sync);
+	cpustat[idx] += delta;
+	irqtime->total += delta;
+	irqtime->tick_delta += delta;
+	u64_stats_update_end(&irqtime->sync);
+}
+
 /*
  * Called before incrementing preempt_count on {soft,}irq_enter
  * and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
 void irqtime_account_irq(struct task_struct *curr)
 {
 	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
-	u64 *cpustat = kcpustat_this_cpu->cpustat;
 	s64 delta;
 	int cpu;
 
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
 	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
 	irqtime->irq_start_time += delta;
 
-	u64_stats_update_begin(&irqtime->sync);
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
 	 * We want to continue accounting softirq time to ksoftirqd thread
 	 * in that case, so as not to confuse scheduler with a special task
 	 * that do not consume any time, but still wants to run.
 	 */
-	if (hardirq_count()) {
-		cpustat[CPUTIME_IRQ] += delta;
-		irqtime->tick_delta += delta;
-	} else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
-		cpustat[CPUTIME_SOFTIRQ] += delta;
-		irqtime->tick_delta += delta;
-	}
-
-	u64_stats_update_end(&irqtime->sync);
+	if (hardirq_count())
+		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..767aab3505a8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
+	u64			total;
 	u64			tick_delta;
 	u64			irq_start_time;
 	struct u64_stats_sync	sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
 
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
+ * subtracted and would never move forward.
+ */
 static inline u64 irq_time_read(int cpu)
 {
 	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
-	u64 *cpustat = kcpustat_cpu(cpu).cpustat;
 	unsigned int seq;
 	u64 total;
 
 	do {
 		seq = __u64_stats_fetch_begin(&irqtime->sync);
-		total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+		total = irqtime->total;
 	} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
 
 	return total;
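Both irqtime_account_delta() and irq_time_read() lean on u64_stats_sync, which is a seqcount on 32-bit kernels and compiles away on 64-bit, where a u64 load is already atomic. A self-contained sketch of the pattern; the u64_stats_* calls are the real API, demo_stats and its helpers are made up for illustration:

#include <linux/u64_stats_sync.h>

struct demo_stats {
	u64			total;
	struct u64_stats_sync	sync;	/* u64_stats_init() at setup */
};

/* Writer side: publish a consistent update. */
static void demo_add(struct demo_stats *s, u64 delta)
{
	u64_stats_update_begin(&s->sync);
	s->total += delta;
	u64_stats_update_end(&s->sync);
}

/* Reader side: retry until a torn-free snapshot is observed. */
static u64 demo_read(struct demo_stats *s)
{
	unsigned int seq;
	u64 total;

	do {
		seq = u64_stats_fetch_begin(&s->sync);
		total = s->total;
	} while (u64_stats_fetch_retry(&s->sync, seq));

	return total;
}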
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..bd8ae8d5ae9c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
 
 /**
  * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
  * @rq:		the source request
+ * @error:	return status to log
  * @nr_bytes:	number of completed bytes
  * @what:	the action
  *
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
  *     Records an action against a request. Will log the bio offset + size.
  *
 **/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+static void blk_add_trace_rq(struct request *rq, int error,
			     unsigned int nr_bytes, u32 what)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt = rq->q->blk_trace;
 
 	if (likely(!bt))
 		return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 		what |= BLK_TC_ACT(BLK_TC_FS);
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-			rq->cmd_flags, what, rq->errors, 0, NULL);
-}
-
-static void blk_add_trace_rq_abort(void *ignore,
-				   struct request_queue *q, struct request *rq)
-{
-	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
+			rq->cmd_flags, what, error, 0, NULL);
 }
 
 static void blk_add_trace_rq_insert(void *ignore,
				    struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
 }
 
 static void blk_add_trace_rq_issue(void *ignore,
				   struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
 }
 
 static void blk_add_trace_rq_requeue(void *ignore,
				     struct request_queue *q,
				     struct request *rq)
 {
-	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
 }
 
-static void blk_add_trace_rq_complete(void *ignore,
-				      struct request_queue *q,
-				      struct request *rq,
-				      unsigned int nr_bytes)
+static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
+				      int error, unsigned int nr_bytes)
 {
-	blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
 }
 
 /**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
-			rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
+			rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
			sizeof(r), &r);
 }
 
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
-			BLK_TA_DRV_DATA, rq->errors, len, data);
+			BLK_TA_DRV_DATA, 0, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
 {
 	int ret;
 
-	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
-	WARN_ON(ret);
 	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
 	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
 	unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
 	unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
-	unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
 
 	tracepoint_synchronize_unregister();
 }
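With block_rq_abort gone, blktrace registers one probe per remaining tracepoint, and each probe must match the tracepoint's prototype exactly; that is why blk_add_trace_rq_complete() changed signature along with the tracepoint it hooks. A sketch of the register/unregister pattern using the block_rq_insert tracepoint from this tree; the demo_* names are hypothetical:

#include <linux/blkdev.h>
#include <trace/events/block.h>

static void demo_rq_insert_probe(void *ignore,
				 struct request_queue *q, struct request *rq)
{
	/* runs whenever block_rq_insert fires; inspect rq here */
}

static int demo_register(void)
{
	return register_trace_block_rq_insert(demo_rq_insert_probe, NULL);
}

static void demo_unregister(void)
{
	unregister_trace_block_rq_insert(demo_rq_insert_probe, NULL);
	/* wait for in-flight probe invocations before freeing state */
	tracepoint_synchronize_unregister();
}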
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..bbf46da28e9a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool)
 	INIT_LIST_HEAD(&pool->idle_list);
 	hash_init(pool->busy_hash);
 
-	init_timer_deferrable(&pool->idle_timer);
-	pool->idle_timer.function = idle_worker_timeout;
-	pool->idle_timer.data = (unsigned long)pool;
+	setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
+			       (unsigned long)pool);
 
 	setup_timer(&pool->mayday_timer, pool_mayday_timeout,
 		    (unsigned long)pool);
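setup_deferrable_timer() is a one-call replacement for init_timer_deferrable() plus manual assignment of .function and .data, in the timer API of this era where callbacks take an unsigned long cookie. A minimal sketch with hypothetical demo_* names:

#include <linux/timer.h>

struct demo_pool {
	struct timer_list idle_timer;
};

static void demo_timeout(unsigned long data)
{
	struct demo_pool *pool = (struct demo_pool *)data;

	/* timer callback; 'pool' is recovered from the data cookie */
	(void)pool;
}

static void demo_init(struct demo_pool *pool)
{
	/*
	 * One call replaces init_timer_deferrable() plus two field
	 * assignments; a deferrable timer does not wake an idle CPU.
	 * Arm it later with mod_timer() as usual.
	 */
	setup_deferrable_timer(&pool->idle_timer, demo_timeout,
			       (unsigned long)pool);
}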