diff options
Diffstat (limited to 'kernel')
39 files changed, 634 insertions, 922 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ce967a63ede..1904e814f282 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,7 +156,29 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 -#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) +#define BPF_MAP_PTR_UNPRIV 1UL +#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ + POISON_POINTER_DELTA)) +#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) + +static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) +{ + return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; +} + +static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) +{ + return aux->map_state & BPF_MAP_PTR_UNPRIV; +} + +static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, + const struct bpf_map *map, bool unpriv) +{ + BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); + unpriv |= bpf_map_ptr_unpriv(aux); + aux->map_state = (unsigned long)map | + (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +} struct bpf_call_arg_meta { struct bpf_map *map_ptr; @@ -2358,6 +2380,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static int +record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + + if (func_id != BPF_FUNC_tail_call && + func_id != BPF_FUNC_map_lookup_elem) + return 0; + if (meta->map_ptr == NULL) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + if (!BPF_MAP_PTR(aux->map_state)) + bpf_map_ptr_store(aux, meta->map_ptr, + meta->map_ptr->unpriv_array); + else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, + meta->map_ptr->unpriv_array); + return 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2412,13 +2457,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - if (func_id == BPF_FUNC_tail_call) { - if (meta.map_ptr == NULL) { - verbose(env, "verifier bug\n"); - return -EINVAL; - } - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; - } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -2429,6 +2467,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_map(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ @@ -2453,8 +2495,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - struct bpf_insn_aux_data *insn_aux; - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); @@ -2470,11 +2510,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - insn_aux = &env->insn_aux_data[insn_idx]; - if (!insn_aux->map_ptr) - insn_aux->map_ptr = meta.map_ptr; - else if (insn_aux->map_ptr != meta.map_ptr) - insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -5470,6 +5505,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; + struct bpf_insn_aux_data *aux; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; @@ -5544,19 +5580,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + aux = &env->insn_aux_data[i + delta]; + if (!bpf_map_ptr_unpriv(aux)) + continue; + /* instead of changing every JIT dealing with tail_call * emit two extra insns: * if (index >= max_entries) goto out; * index &= array->index_mask; * to avoid out-of-bounds cpu speculation */ - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON) { + if (bpf_map_ptr_poisoned(aux)) { verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } - if (!map_ptr->unpriv_array) - continue; + + map_ptr = BPF_MAP_PTR(aux->map_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -5580,9 +5619,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON || - !map_ptr->ops->map_gen_lookup) + aux = &env->insn_aux_data[i + delta]; + if (bpf_map_ptr_poisoned(aux)) + goto patch_call_imm; + + map_ptr = BPF_MAP_PTR(aux->map_state); + if (!map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b928b27050c6..0808a33d16d3 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -218,9 +218,9 @@ extern const struct proc_ns_operations cgroupns_operations; * cgroup-v1.c */ extern struct cftype cgroup1_base_files[]; -extern const struct file_operations proc_cgroupstats_operations; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; +int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index a2c05d2476ac..e06c97f3ed1a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -682,7 +682,7 @@ struct cftype cgroup1_base_files[] = { }; /* Display information about each subsystem and each hierarchy */ -static int proc_cgroupstats_show(struct seq_file *m, void *v) +int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; int i; @@ -705,18 +705,6 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) return 0; } -static int cgroupstats_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_cgroupstats_show, NULL); -} - -const struct file_operations proc_cgroupstats_operations = { - .open = cgroupstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a662bfcbea0e..12883656e63e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5335,7 +5335,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); WARN_ON(register_filesystem(&cgroup2_fs_type)); - WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); + WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show)); return 0; } diff --git a/kernel/dma.c b/kernel/dma.c index 3506fc34a712..40f152936316 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -135,21 +135,9 @@ static int proc_dma_show(struct seq_file *m, void *v) } #endif /* MAX_DMA_CHANNELS */ -static int proc_dma_open(struct inode *inode, struct file *file) -{ - return single_open(file, proc_dma_show, NULL); -} - -static const struct file_operations proc_dma_operations = { - .open = proc_dma_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_dma_init(void) { - proc_create("dma", 0, NULL, &proc_dma_operations); + proc_create_single("dma", 0, NULL, proc_dma_show); return 0; } diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index a5697119290e..33f07c5f2515 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,21 +27,9 @@ static int execdomains_proc_show(struct seq_file *m, void *v) return 0; } -static int execdomains_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, execdomains_proc_show, NULL); -} - -static const struct file_operations execdomains_proc_fops = { - .open = execdomains_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int __init proc_execdomains_init(void) { - proc_create("execdomains", 0, NULL, &execdomains_proc_fops); + proc_create_single("execdomains", 0, NULL, execdomains_proc_show); return 0; } module_init(proc_execdomains_init); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7cb091d81d91..37eda10f5c36 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -185,11 +185,6 @@ static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } -static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode)); -} - static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, @@ -198,13 +193,6 @@ static const struct file_operations irq_affinity_proc_fops = { .write = irq_affinity_proc_write, }; -static const struct file_operations irq_affinity_hint_proc_fops = { - .open = irq_affinity_hint_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static const struct file_operations irq_affinity_list_proc_fops = { .open = irq_affinity_list_proc_open, .read = seq_read, @@ -223,32 +211,6 @@ static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(EFFECTIVE_LIST, m); } - -static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); -} - -static int irq_effective_aff_list_proc_open(struct inode *inode, - struct file *file) -{ - return single_open(file, irq_effective_aff_list_proc_show, - PDE_DATA(inode)); -} - -static const struct file_operations irq_effective_aff_proc_fops = { - .open = irq_effective_aff_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static const struct file_operations irq_effective_aff_list_proc_fops = { - .open = irq_effective_aff_list_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif static int default_affinity_show(struct seq_file *m, void *v) @@ -313,18 +275,6 @@ static int irq_node_proc_show(struct seq_file *m, void *v) seq_printf(m, "%d\n", irq_desc_get_node(desc)); return 0; } - -static int irq_node_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_node_proc_show, PDE_DATA(inode)); -} - -static const struct file_operations irq_node_proc_fops = { - .open = irq_node_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -337,18 +287,6 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) return 0; } -static int irq_spurious_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_spurious_proc_show, PDE_DATA(inode)); -} - -static const struct file_operations irq_spurious_proc_fops = { - .open = irq_spurious_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) @@ -421,24 +359,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_affinity_proc_fops, irqp); /* create /proc/irq/<irq>/affinity_hint */ - proc_create_data("affinity_hint", 0444, desc->dir, - &irq_affinity_hint_proc_fops, irqp); + proc_create_single_data("affinity_hint", 0444, desc->dir, + irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, irqp); - proc_create_data("node", 0444, desc->dir, - &irq_node_proc_fops, irqp); + proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, + irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - proc_create_data("effective_affinity", 0444, desc->dir, - &irq_effective_aff_proc_fops, irqp); - proc_create_data("effective_affinity_list", 0444, desc->dir, - &irq_effective_aff_list_proc_fops, irqp); + proc_create_single_data("effective_affinity", 0444, desc->dir, + irq_effective_aff_proc_show, irqp); + proc_create_single_data("effective_affinity_list", 0444, desc->dir, + irq_effective_aff_list_proc_show, irqp); # endif #endif - proc_create_data("spurious", 0444, desc->dir, - &irq_spurious_proc_fops, (void *)(long)irq); + proc_create_single_data("spurious", 0444, desc->dir, + irq_spurious_proc_show, (void *)(long)irq); out_unlock: mutex_unlock(®ister_lock); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2017a39ab490..481951bf091d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -193,7 +193,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); void kthread_park_complete(struct task_struct *k) { - complete(&to_kthread(k)->parked); + complete_all(&to_kthread(k)->parked); } static int kthread(void *_create) @@ -459,6 +459,7 @@ void kthread_unpark(struct task_struct *k) if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); + reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); wake_up_state(k, TASK_PARKED); } @@ -483,9 +484,6 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; - if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) - return -EBUSY; - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ad69bbc9bd28..3dd980dfba2d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -101,18 +101,6 @@ static const struct seq_operations lockdep_ops = { .show = l_show, }; -static int lockdep_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_ops); -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { @@ -170,18 +158,6 @@ static const struct seq_operations lockdep_chains_ops = { .stop = lc_stop, .show = lc_show, }; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_chains_ops); -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; #endif /* CONFIG_PROVE_LOCKING */ static void lockdep_stats_debug_show(struct seq_file *m) @@ -355,18 +331,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) return 0; } -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - #ifdef CONFIG_LOCK_STAT struct lock_stat_data { @@ -682,14 +646,11 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create_seq("lockdep", S_IRUSR, NULL, &lockdep_ops); #ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); + proc_create_seq("lockdep_chains", S_IRUSR, NULL, &lockdep_chains_ops); #endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - + proc_create_single("lockdep_stats", S_IRUSR, NULL, lockdep_stats_show); #ifdef CONFIG_LOCK_STAT proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, &proc_lock_stat_operations); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 11b4282c2d20..1efcb5b0c3ed 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -269,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); + bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); @@ -376,7 +376,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -384,7 +384,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_RECLAIM | + src = (void *)__get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -691,7 +691,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); + page = (void *)__get_free_page(GFP_NOIO | __GFP_HIGH); if (!page) { pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -989,7 +989,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_RECLAIM | __GFP_HIGH); + __get_free_page(GFP_NOIO | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1261,8 +1261,8 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_RECLAIM | __GFP_HIGH : - __GFP_RECLAIM | __GFP_NOWARN | + GFP_NOIO | __GFP_HIGH : + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!page[i]) { diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 7a693e31184a..40cea6735c2d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -270,6 +270,12 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) } } +/* Returns first leaf rcu_node of the specified RCU flavor. */ +#define rcu_first_leaf_node(rsp) ((rsp)->level[rcu_num_lvls - 1]) + +/* Is this rcu_node a leaf? */ +#define rcu_is_leaf_node(rnp) ((rnp)->level == rcu_num_lvls - 1) + /* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. @@ -284,8 +290,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * rcu_node tree with but one rcu_node structure, this loop is a no-op. */ #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + for ((rnp) = &(rsp)->node[0]; !rcu_is_leaf_node(rsp, rnp); (rnp)++) /* * Scan the leaves of the rcu_node hierarchy for the specified rcu_state @@ -294,7 +299,7 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) * It is still a leaf node, even if it is also the root node. */ #define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + for ((rnp) = rcu_first_leaf_node(rsp); \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* @@ -486,6 +491,7 @@ void rcu_force_quiescent_state(void); void rcu_bh_force_quiescent_state(void); void rcu_sched_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; +extern struct workqueue_struct *rcu_par_gp_wq; #endif /* #else #ifdef CONFIG_TINY_RCU */ #ifdef CONFIG_RCU_NOCB_CPU diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 88cba7c2956c..5aff271adf1e 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -404,24 +404,6 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) } /* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* * Merge the source rcu_segcblist structure into the destination * rcu_segcblist structure, then initialize the source. Any pending * callbacks from the source get to start over. It is best to diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 581c12b63544..948470cef385 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -134,7 +134,5 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); -bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq); void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 777e7a6a0292..e232846516b3 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -369,7 +369,7 @@ static bool __maybe_unused torturing_tasks(void) */ static void rcu_perf_wait_shutdown(void) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) return; while (!torture_must_stop()) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 680c96d8c00f..e628fcfd1bde 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -593,7 +593,12 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - cleanup_srcu_struct(&srcu_ctld); + static DEFINE_TORTURE_RANDOM(rand); + + if (torture_random(&rand) & 0x800) + cleanup_srcu_struct(&srcu_ctld); + else + cleanup_srcu_struct_quiesced(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } @@ -1609,6 +1614,9 @@ static enum cpuhp_state rcutor_hp; static void rcu_torture_cleanup(void) { + int flags = 0; + unsigned long gpnum = 0; + unsigned long completed = 0; int i; rcutorture_record_test_transition(); @@ -1639,6 +1647,11 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); + pr_alert("%s: End-test grace-period state: g%lu c%lu f%#x\n", + cur_ops->name, gpnum, completed, flags); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); for (i = 0; i < ncbflooders; i++) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 76ac5f50b2c7..622792abe41a 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -86,16 +86,19 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void cleanup_srcu_struct(struct srcu_struct *sp) +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); - flush_work(&sp->srcu_work); + if (quiesced) + WARN_ON(work_pending(&sp->srcu_work)); + else + flush_work(&sp->srcu_work); WARN_ON(sp->srcu_gp_running); WARN_ON(sp->srcu_gp_waiting); WARN_ON(sp->srcu_cb_head); WARN_ON(&sp->srcu_cb_head != sp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index fb560fca9ef4..b4123d7a2cec 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -366,24 +366,28 @@ static unsigned long srcu_get_delay(struct srcu_struct *sp) return SRCU_INTERVAL; } -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) +/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ +void _cleanup_srcu_struct(struct srcu_struct *sp, bool quiesced) { int cpu; if (WARN_ON(!srcu_get_delay(sp))) - return; /* Leakage unless caller handles error. */ + return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - flush_delayed_work(&sp->work); + return; /* Just leak it! */ + if (quiesced) { + if (WARN_ON(delayed_work_pending(&sp->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&sp->work); + } for_each_possible_cpu(cpu) - flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (quiesced) { + if (WARN_ON(delayed_work_pending(&per_cpu_ptr(sp->sda, cpu)->work))) + return; /* Just leak it! */ + } else { + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + } if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(sp))) { pr_info("%s: Active srcu_struct %p state: %d\n", __func__, sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); @@ -392,7 +396,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) free_percpu(sp->sda); sp->sda = NULL; } -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a734692a581..aa7cade1b9f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -524,8 +524,6 @@ module_param(rcu_kick_kthreads, bool, 0644); static ulong jiffies_till_sched_qs = HZ / 10; module_param(jiffies_till_sched_qs, ulong, 0444); -static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); @@ -711,44 +709,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * Is there any need for future grace periods? - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_future_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; - int *fp = &rnp->need_future_gp[idx]; - - lockdep_assert_irqs_disabled(); - return READ_ONCE(*fp); -} - -/* - * Does the current CPU require a not-yet-started grace period? - * The caller must have disabled interrupts to prevent races with - * normal callback registry. - */ -static bool -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - lockdep_assert_irqs_disabled(); - if (rcu_gp_in_progress(rsp)) - return false; /* No, a grace period is already in progress. */ - if (rcu_future_needs_gp(rsp)) - return true; /* Yes, a no-CBs CPU needs one. */ - if (!rcu_segcblist_is_enabled(&rdp->cblist)) - return false; /* No, this is a no-CBs (or offline) CPU. */ - if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) - return true; /* Yes, CPU has newly registered callbacks. */ - if (rcu_segcblist_future_gp_needed(&rdp->cblist, - READ_ONCE(rsp->completed))) - return true; /* Yes, CBs for future grace period. */ - return false; /* No grace period needed. */ -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -1234,10 +1194,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * Has this CPU encountered a cond_resched_rcu_qs() since the - * beginning of the grace period? For this to be the case, - * the CPU has to have noticed the current grace period. This - * might not be the case for nohz_full CPUs looping in the kernel. + * Has this CPU encountered a cond_resched() since the beginning + * of the grace period? For this to be the case, the CPU has to + * have noticed the current grace period. This might not be the + * case for nohz_full CPUs looping in the kernel. */ jtsq = jiffies_till_sched_qs; ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); @@ -1642,18 +1602,30 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, return rnp->completed + 1; /* + * If the current rcu_node structure believes that RCU is + * idle, and if the rcu_state structure does not yet reflect + * the start of a new grace period, then the next grace period + * will suffice. The memory barrier is needed to accurately + * sample the rsp->gpnum, and pairs with the second lock + * acquisition in rcu_gp_init(), which is augmented with + * smp_mb__after_unlock_lock() for this purpose. + */ + if (rnp->gpnum == rnp->completed) { + smp_mb(); /* See above block comment. */ + if (READ_ONCE(rsp->gpnum) == rnp->completed) + return rnp->completed + 1; + } + + /* * Otherwise, wait for a possible partial grace period and * then the subsequent full grace period. */ return rnp->completed + 2; } -/* - * Trace-event helper function for rcu_start_future_gp() and - * rcu_nocb_wait_gp(). - */ -static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) +/* Trace-event wrapper function for trace_rcu_future_grace_period. */ +static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -1661,96 +1633,67 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } /* - * Start some future grace period, as needed to handle newly arrived + * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. Returns true if there + * rcu_node structure's ->need_future_gp[] field. Returns true if there * is reason to awaken the grace-period kthread. * - * The caller must hold the specified rcu_node structure's ->lock. + * The caller must hold the specified rcu_node structure's ->lock, which + * is why the caller is responsible for waking the grace-period kthread. */ -static bool __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long *c_out) +static bool rcu_start_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c) { - unsigned long c; bool ret = false; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - - raw_lockdep_assert_held_rcu_node(rnp); - - /* - * Pick up grace-period number for new callbacks. If this - * grace period is already marked as needed, return to the caller. - */ - c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); - if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - goto out; - } + struct rcu_state *rsp = rdp->rsp; + struct rcu_node *rnp_root; /* - * If either this rcu_node structure or the root rcu_node structure - * believe that a grace period is in progress, then we must wait - * for the one following, which is in "c". Because our request - * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. We only do the lockless check - * of rnp_root's fields if the current rcu_node structure thinks - * there is no grace period in flight, and because we hold rnp->lock, - * the only possible change is when rnp_root's two fields are - * equal, in which case rnp_root->gpnum might be concurrently - * incremented. But that is OK, as it will just result in our - * doing some extra useless work. + * Use funnel locking to either acquire the root rcu_node + * structure's lock or bail out if the need for this grace period + * has already been recorded -- or has already started. If there + * is already a grace period in progress in a non-leaf node, no + * recording is needed because the end of the grace period will + * scan the leaf rcu_node structures. Note that rnp->lock must + * not be released. */ - if (rnp->gpnum != rnp->completed || - READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { - rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - goto out; + raw_lockdep_assert_held_rcu_node(rnp); + trace_rcu_this_gp(rnp, rdp, c, TPS("Startleaf")); + for (rnp_root = rnp; 1; rnp_root = rnp_root->parent) { + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); + WARN_ON_ONCE(ULONG_CMP_LT(rnp_root->gpnum + + need_future_gp_mask(), c)); + if (need_future_gp_element(rnp_root, c) || + ULONG_CMP_GE(rnp_root->gpnum, c) || + (rnp != rnp_root && + rnp_root->gpnum != rnp_root->completed)) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Prestarted")); + goto unlock_out; + } + need_future_gp_element(rnp_root, c) = true; + if (rnp_root != rnp && rnp_root->parent != NULL) + raw_spin_unlock_rcu_node(rnp_root); + if (!rnp_root->parent) + break; /* At root, and perhaps also leaf. */ } - /* - * There might be no grace period in progress. If we don't already - * hold it, acquire the root rcu_node structure's lock in order to - * start one (if needed). - */ - if (rnp != rnp_root) - raw_spin_lock_rcu_node(rnp_root); - - /* - * Get a new grace-period number. If there really is no grace - * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. - */ - c = rcu_cbs_completed(rdp->rsp, rnp_root); - if (!rcu_is_nocb_cpu(rdp->cpu)) - (void)rcu_segcblist_accelerate(&rdp->cblist, c); - - /* - * If the needed for the required grace period is already - * recorded, trace and leave. - */ - if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); + /* If GP already in progress, just leave, otherwise start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedleafroot")); goto unlock_out; } - - /* Record the need for the future grace period. */ - rnp_root->need_future_gp[c & 0x1]++; - - /* If a grace period is not already in progress, start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); - } else { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + trace_rcu_this_gp(rnp_root, rdp, c, TPS("Startedroot")); + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); + if (!rsp->gp_kthread) { + trace_rcu_this_gp(rnp_root, rdp, c, TPS("NoGPkthread")); + goto unlock_out; } + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); + ret = true; /* Caller must wake GP kthread. */ unlock_out: if (rnp != rnp_root) raw_spin_unlock_rcu_node(rnp_root); -out: - if (c_out != NULL) - *c_out = c; return ret; } @@ -1758,16 +1701,16 @@ out: * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. */ -static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - int c = rnp->completed; - int needmore; + unsigned long c = rnp->completed; + bool needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, - needmore ? TPS("CleanupMore") : TPS("Cleanup")); + need_future_gp_element(rnp, c) = false; + needmore = need_any_future_gp(rnp); + trace_rcu_this_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1802,6 +1745,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { + unsigned long c; bool ret = false; raw_lockdep_assert_held_rcu_node(rnp); @@ -1820,8 +1764,9 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, * accelerating callback invocation to an earlier grace-period * number. */ - if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) - ret = rcu_start_future_gp(rnp, rdp, NULL); + c = rcu_cbs_completed(rsp, rnp); + if (rcu_segcblist_accelerate(&rdp->cblist, c)) + ret = rcu_start_this_gp(rnp, rdp, c); /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) @@ -2049,7 +1994,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq_rcu_node(rnp); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -2108,7 +2053,6 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; bool needgp = false; - int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); struct swait_queue_head *sq; @@ -2147,31 +2091,35 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) if (rnp == rdp->mynode) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ - nocb += rcu_future_gp_cleanup(rsp, rnp); + needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq_rcu_node(rnp); rcu_nocb_gp_cleanup(sq); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ - rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->gp_state = RCU_GP_IDLE; + /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(rsp->rda); + if (need_any_future_gp(rnp)) { + trace_rcu_this_gp(rnp, rdp, rsp->completed - 1, + TPS("CleanupMore")); + needgp = true; + } /* Advance CBs to reduce false positives below. */ - needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; - if (needgp || cpu_needs_another_gp(rsp, rdp)) { + if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, - READ_ONCE(rsp->gpnum), + trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("newreq")); } + WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); raw_spin_unlock_irq_rcu_node(rnp); } @@ -2202,7 +2150,7 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2247,7 +2195,7 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); ret = 0; /* Force full wait till next FQS. */ j = jiffies_till_next_fqs; @@ -2260,7 +2208,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } } else { /* Deal with stray signal. */ - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); WARN_ON(signal_pending(current)); trace_rcu_grace_period(rsp->name, @@ -2283,71 +2231,6 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock and hard irqs must be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - * - * Returns true if the grace-period kthread must be awakened. - */ -static bool -rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - raw_lockdep_assert_held_rcu_node(rnp); - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either we have not yet spawned the grace-period - * task, this CPU does not need another grace period, - * or a grace period is already in progress. - * Either way, don't start a new grace period. - */ - return false; - } - WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); - trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), - TPS("newreq")); - - /* - * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to our caller. - */ - return true; -} - -/* - * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's - * callbacks. Note that rcu_start_gp_advanced() cannot do this because it - * is invoked indirectly from rcu_advance_cbs(), which would result in - * endless recursion -- or would do so if it wasn't for the self-deadlock - * that is encountered beforehand. - * - * Returns true if the grace-period kthread needs to be awakened. - */ -static bool rcu_start_gp(struct rcu_state *rsp) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - bool ret = false; - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks - * then start the grace period! - */ - ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; - ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; - return ret; -} - -/* * Report a full set of quiescent states to the specified rcu_state data * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period * kthread if another grace period is required. Whether we wake @@ -2398,7 +2281,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, @@ -2782,7 +2665,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { @@ -2874,22 +2757,27 @@ __rcu_process_callbacks(struct rcu_state *rsp) unsigned long flags; bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); + struct rcu_node *rnp; WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); - /* Does this CPU require a not-yet-started grace period? */ - local_irq_save(flags); - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ - needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); - if (needwake) - rcu_gp_kthread_wake(rsp); - } else { - local_irq_restore(flags); + /* No grace period and unregistered callbacks? */ + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist)) { + local_irq_save(flags); + if (rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) { + local_irq_restore(flags); + } else { + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); + } } /* If there are callbacks ready, invoke them. */ @@ -2973,11 +2861,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp_root = rcu_get_root(rsp); + struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp_root); - needwake = rcu_start_gp(rsp); - raw_spin_unlock_rcu_node(rnp_root); + raw_spin_lock_rcu_node(rnp); + needwake = rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock_rcu_node(rnp); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3368,7 +3256,9 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (!rcu_gp_in_progress(rsp) && + rcu_segcblist_is_enabled(&rdp->cblist) && + !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; /* Has another RCU grace period completed? */ @@ -3775,6 +3665,8 @@ int rcutree_dead_cpu(unsigned int cpu) return 0; } +static DEFINE_PER_CPU(int, rcu_cpu_started); + /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3796,6 +3688,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; struct rcu_state *rsp; + if (per_cpu(rcu_cpu_started, cpu)) + return; + + per_cpu(rcu_cpu_started, cpu) = 1; + for_each_rcu_flavor(rsp) { rdp = per_cpu_ptr(rsp->rda, cpu); rnp = rdp->mynode; @@ -3852,6 +3749,8 @@ void rcu_report_dead(unsigned int cpu) preempt_enable(); for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); + + per_cpu(rcu_cpu_started, cpu) = 0; } /* Migrate the dead CPU's callbacks to the current CPU. */ @@ -3861,6 +3760,7 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) struct rcu_data *my_rdp; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + bool needwake; if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -3872,12 +3772,15 @@ static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) return; } raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ - rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ - rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + /* Leverage recent GPs and set GP for new callbacks. */ + needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || + rcu_advance_cbs(rsp, rnp_root, my_rdp); rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + if (needwake) + rcu_gp_kthread_wake(rsp); WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || !rcu_segcblist_empty(&rdp->cblist), "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", @@ -4056,7 +3959,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) init_swait_queue_head(&rsp->gp_wq); init_swait_queue_head(&rsp->expedited_wq); - rnp = rsp->level[rcu_num_lvls - 1]; + rnp = rcu_first_leaf_node(rsp); for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; @@ -4168,6 +4071,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) } struct workqueue_struct *rcu_gp_wq; +struct workqueue_struct *rcu_par_gp_wq; void __init rcu_init(void) { @@ -4199,6 +4103,8 @@ void __init rcu_init(void) /* Create workqueue for expedited GPs and for Tree SRCU. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); } #include "tree_exp.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index f491ab4f2e8e..78e051dffc5b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -58,6 +58,14 @@ struct rcu_dynticks { #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; +/* Communicate arguments to a workqueue handler. */ +struct rcu_exp_work { + smp_call_func_t rew_func; + struct rcu_state *rew_rsp; + unsigned long rew_s; + struct work_struct rew_work; +}; + /* RCU's kthread states for tracing. */ #define RCU_KTHREAD_STOPPED 0 #define RCU_KTHREAD_RUNNING 1 @@ -150,15 +158,32 @@ struct rcu_node { struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[2]; - /* Counts of upcoming no-CB GP requests. */ + u8 need_future_gp[4]; /* Counts of upcoming GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; spinlock_t exp_lock ____cacheline_internodealigned_in_smp; unsigned long exp_seq_rq; wait_queue_head_t exp_wq[4]; + struct rcu_exp_work rew; + bool exp_need_flush; /* Need to flush workitem? */ } ____cacheline_internodealigned_in_smp; +/* Accessors for ->need_future_gp[] array. */ +#define need_future_gp_mask() \ + (ARRAY_SIZE(((struct rcu_node *)NULL)->need_future_gp) - 1) +#define need_future_gp_element(rnp, c) \ + ((rnp)->need_future_gp[(c) & need_future_gp_mask()]) +#define need_any_future_gp(rnp) \ +({ \ + int __i; \ + bool __nonzero = false; \ + \ + for (__i = 0; __i < ARRAY_SIZE((rnp)->need_future_gp); __i++) \ + __nonzero = __nonzero || \ + READ_ONCE((rnp)->need_future_gp[__i]); \ + __nonzero; \ +}) + /* * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and * are indexed relative to this interval rather than the global CPU ID space. @@ -224,10 +249,6 @@ struct rcu_data { #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - atomic_long_t exp_workdone0; /* # done by workqueue. */ - atomic_long_t exp_workdone1; /* # done by others #1. */ - atomic_long_t exp_workdone2; /* # done by others #2. */ - atomic_long_t exp_workdone3; /* # done by others #3. */ int exp_dynticks_snap; /* Double-check need for IPI. */ /* 6) Callback offloading. */ @@ -408,7 +429,6 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); -bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -438,7 +458,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); #ifdef CONFIG_RCU_BOOST -static void rcu_preempt_do_callbacks(void); static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -454,7 +473,6 @@ static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f72eefab8543..d40708e8c5d6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -20,6 +20,8 @@ * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#include <linux/lockdep.h> + /* * Record the start of an expedited grace period. */ @@ -154,15 +156,35 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold the rcu_state's exp_mutex. + * Caller must hold the specificed rcu_node structure's ->lock */ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) { + raw_lockdep_assert_held_rcu_node(rnp); + return rnp->exp_tasks == NULL && READ_ONCE(rnp->expmask) == 0; } /* + * Like sync_rcu_preempt_exp_done(), but this function assumes the caller + * doesn't hold the rcu_node's ->lock, and will acquire and release the lock + * itself + */ +static bool sync_rcu_preempt_exp_done_unlocked(struct rcu_node *rnp) +{ + unsigned long flags; + bool ret; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ret = sync_rcu_preempt_exp_done(rnp); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + return ret; +} + + +/* * Report the exit from RCU read-side critical section for the last task * that queued itself during or before the current expedited preemptible-RCU * grace period. This event is reported either to the rcu_node structure on @@ -170,8 +192,7 @@ static bool sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. + * Caller must hold the specified rcu_node structure's ->lock. */ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake, unsigned long flags) @@ -207,8 +228,6 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Report expedited quiescent state for specified node. This is a * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. */ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -221,8 +240,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, /* * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. + * specified leaf rcu_node structure. */ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long mask, bool wake) @@ -248,14 +266,12 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, } /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) +static bool sync_exp_work_done(struct rcu_state *rsp, unsigned long s) { if (rcu_exp_gp_seq_done(rsp, s)) { trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); /* Ensure test happens before caller kfree(). */ smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); return true; } return false; @@ -289,7 +305,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * promoting locality and is not strictly needed for correctness. */ for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + if (sync_exp_work_done(rsp, s)) return true; /* Work not done, either wait here or go up. */ @@ -302,8 +318,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp->grplo, rnp->grphi, TPS("wait")); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); + sync_exp_work_done(rsp, s)); return true; } rnp->exp_seq_rq = s; /* Followers can wait on us. */ @@ -313,7 +328,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } mutex_lock(&rsp->exp_mutex); fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + if (sync_exp_work_done(rsp, s)) { mutex_unlock(&rsp->exp_mutex); return true; } @@ -362,93 +377,129 @@ static void sync_sched_exp_online_cleanup(int cpu) } /* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. + * Select the CPUs within the specified rcu_node that the upcoming + * expedited grace period needs to wait for. */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) +static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; + smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; - struct rcu_node *rnp; - - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); - sync_exp_reset_tree(rsp); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); + struct rcu_exp_work *rewp = + container_of(wp, struct rcu_exp_work, rew_work); + struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); + struct rcu_state *rsp = rewp->rew_rsp; - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); - int snap; + func = rewp->rew_func; + raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (raw_smp_processor_id() == cpu || - !(rnp->qsmaskinitnext & mask)) { + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu); + int snap; + + if (raw_smp_processor_id() == cpu || + !(rnp->qsmaskinitnext & mask)) { + mask_ofl_test |= mask; + } else { + snap = rcu_dynticks_snap(rdtp); + if (rcu_dynticks_in_eqs(snap)) mask_ofl_test |= mask; - } else { - snap = rcu_dynticks_snap(rdtp); - if (rcu_dynticks_in_eqs(snap)) - mask_ofl_test |= mask; - else - rdp->exp_dynticks_snap = snap; - } + else + rdp->exp_dynticks_snap = snap; } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* IPI the remaining CPUs for expedited quiescent state. */ - for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { - unsigned long mask = leaf_node_cpu_bit(rnp, cpu); - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (!(mask_ofl_ipi & mask)) - continue; + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + + if (!(mask_ofl_ipi & mask)) + continue; retry_ipi: - if (rcu_dynticks_in_eqs_since(rdp->dynticks, - rdp->exp_dynticks_snap)) { - mask_ofl_test |= mask; - continue; - } - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with CPU hotplug operation. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rnp->qsmaskinitnext & mask) && - (rnp->expmask & mask)) { - /* Online, so delay for a bit and try again. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); - schedule_timeout_uninterruptible(1); - goto retry_ipi; - } - /* CPU really is offline, so we can ignore it. */ - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { + mask_ofl_test |= mask; + continue; + } + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with CPU hotplug operation. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if ((rnp->qsmaskinitnext & mask) && + (rnp->expmask & mask)) { + /* Online, so delay for a bit and try again. */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl")); + schedule_timeout_uninterruptible(1); + goto retry_ipi; + } + /* CPU really is offline, so we can ignore it. */ + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + struct rcu_node *rnp; + + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset")); + sync_exp_reset_tree(rsp); + trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select")); + + /* Schedule work for each leaf rcu_node structure. */ + rcu_for_each_leaf_node(rsp, rnp) { + rnp->exp_need_flush = false; + if (!READ_ONCE(rnp->expmask)) + continue; /* Avoid early boot non-existent wq. */ + rnp->rew.rew_func = func; + rnp->rew.rew_rsp = rsp; + if (!READ_ONCE(rcu_par_gp_wq) || + rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { + /* No workqueues yet. */ + sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work); + continue; } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); + queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work); + rnp->exp_need_flush = true; } + + /* Wait for workqueue jobs (if any) to complete. */ + rcu_for_each_leaf_node(rsp, rnp) + if (rnp->exp_need_flush) + flush_work(&rnp->rew.rew_work); } static void synchronize_sched_expedited_wait(struct rcu_state *rsp) @@ -469,9 +520,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = swait_event_timeout( rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), + sync_rcu_preempt_exp_done_unlocked(rnp_root), jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + if (ret > 0 || sync_rcu_preempt_exp_done_unlocked(rnp_root)) return; WARN_ON(ret < 0); /* workqueues should not be signaled. */ if (rcu_cpu_stall_suppress) @@ -504,7 +555,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) + if (sync_rcu_preempt_exp_done_unlocked(rnp)) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, @@ -560,14 +611,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) mutex_unlock(&rsp->exp_wake_mutex); } -/* Let the workqueue handler know what it is supposed to do. */ -struct rcu_exp_work { - smp_call_func_t rew_func; - struct rcu_state *rew_rsp; - unsigned long rew_s; - struct work_struct rew_work; -}; - /* * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. @@ -633,7 +676,7 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + sync_exp_work_done(rsp, s)); smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 84fbee4686d3..7fd12039e512 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -182,7 +182,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) raw_lockdep_assert_held_rcu_node(rnp); WARN_ON_ONCE(rdp->mynode != rnp); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); /* * Decide where to queue the newly blocked task. In theory, @@ -384,6 +384,50 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) } /* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = READ_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. */ @@ -489,7 +533,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); - WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); + WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -685,15 +729,6 @@ static void rcu_preempt_check_callbacks(void) t->rcu_read_unlock_special.b.need_qs = true; } -#ifdef CONFIG_RCU_BOOST - -static void rcu_preempt_do_callbacks(void) -{ - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p)); -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - /** * call_rcu() - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -1140,7 +1175,7 @@ static void rcu_kthread_do_work(void) { rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_preempt_do_callbacks(); + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); } static void rcu_cpu_kthread_setup(unsigned int cpu) @@ -1607,7 +1642,7 @@ static int rcu_oom_notify(struct notifier_block *self, for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); } /* Unconditionally decrement: no need to wake ourselves up. */ @@ -1780,19 +1815,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) swake_up_all(sq); } -/* - * Set the root rcu_node structure's ->need_future_gp field - * based on the sum of those of all rcu_node structures. This does - * double-count the root rcu_node structure's requests, but this - * is necessary to handle the possibility of a rcu_nocb_kthread() - * having awakened during the time that the rcu_node structures - * were being updated for the end of the previous grace period. - */ -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ - rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; @@ -1966,7 +1988,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; @@ -2048,7 +2070,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) struct rcu_node *rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - needwake = rcu_start_future_gp(rnp, rdp, &c); + c = rcu_cbs_completed(rdp->rsp, rnp); + needwake = rcu_start_this_gp(rnp, rdp, c); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2057,7 +2080,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2065,9 +2088,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; WARN_ON(signal_pending(current)); - trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); + trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } @@ -2236,7 +2259,7 @@ static int rcu_nocb_kthread(void *arg) cl++; c++; local_bh_enable(); - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); list = next; } trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); @@ -2292,7 +2315,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) void __init rcu_init_nohz(void) { int cpu; - bool need_rcu_nocb_mask = true; + bool need_rcu_nocb_mask = false; struct rcu_state *rsp; #if defined(CONFIG_NO_HZ_FULL) @@ -2315,7 +2338,7 @@ void __init rcu_init_nohz(void) #endif /* #if defined(CONFIG_NO_HZ_FULL) */ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); cpumask_and(rcu_nocb_mask, cpu_possible_mask, rcu_nocb_mask); } @@ -2495,10 +2518,6 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ -} - static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; @@ -2587,8 +2606,7 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) } /* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. + * Bind the RCU grace-period kthreads to the housekeeping CPU. */ static void rcu_bind_gp_kthread(void) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 68fa19a5e7bd..4c230a60ece4 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -226,54 +226,6 @@ core_initcall(rcu_set_runtime_mode); #endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ -#ifdef CONFIG_PREEMPT_RCU - -/* - * Preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* critical section after entry code. */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* - * Preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; - } else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = READ_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = @@ -624,7 +576,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); * grace period has elapsed, in other words after all currently * executing rcu-tasks read-side critical sections have elapsed. These * read-side critical sections are delimited by calls to schedule(), - * cond_resched_rcu_qs(), idle execution, userspace execution, calls + * cond_resched_tasks_rcu_qs(), idle execution, userspace execution, calls * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched(). * * This is a very specialized primitive, intended only for a few uses in diff --git a/kernel/resource.c b/kernel/resource.c index 2af6c03858b9..b589dda910b3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -87,7 +87,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = m->private; + struct resource *p = PDE_DATA(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -103,7 +103,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = m->private; + struct resource *root = PDE_DATA(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; @@ -135,44 +135,11 @@ static const struct seq_operations resource_op = { .show = r_show, }; -static int ioports_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &ioport_resource; - } - return res; -} - -static int iomem_open(struct inode *inode, struct file *file) -{ - int res = seq_open(file, &resource_op); - if (!res) { - struct seq_file *m = file->private_data; - m->private = &iomem_resource; - } - return res; -} - -static const struct file_operations proc_ioports_operations = { - .open = ioports_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations proc_iomem_operations = { - .open = iomem_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init ioresources_init(void) { - proc_create("ioports", 0, NULL, &proc_ioports_operations); - proc_create("iomem", 0, NULL, &proc_iomem_operations); + proc_create_seq_data("ioports", 0, NULL, &resource_op, + &ioport_resource); + proc_create_seq_data("iomem", 0, NULL, &resource_op, &iomem_resource); return 0; } __initcall(ioresources_init); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 092f7c4de903..e27034bd954e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -881,6 +881,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} + +/* + * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). + */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + return false; + + if (is_per_cpu_kthread(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +} + /* * This is how migration works: * @@ -938,16 +965,8 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (p->flags & PF_KTHREAD) { - if (unlikely(!cpu_online(dest_cpu))) - return rq; - } else { - if (unlikely(!cpu_active(dest_cpu))) - return rq; - } - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (!is_cpu_allowed(p, dest_cpu)) return rq; update_rq_clock(rq); @@ -1476,10 +1495,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, &p->cpus_allowed) { - if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) - continue; - if (!cpu_online(dest_cpu)) + if (!is_cpu_allowed(p, dest_cpu)) continue; + goto out; } @@ -1542,8 +1560,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) * [ this allows ->select_task() to simply return task_cpu(p) and * not worry about this generic constraint ] */ - if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_online(cpu))) + if (unlikely(!is_cpu_allowed(p, cpu))) cpu = select_fallback_rq(task_cpu(p), p); return cpu; @@ -5008,20 +5025,6 @@ int __cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(__cond_resched_lock); -int __sched __cond_resched_softirq(void) -{ - BUG_ON(!in_softirq()); - - if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { - local_bh_enable(); - preempt_schedule_common(); - local_bh_disable(); - return 1; - } - return 0; -} -EXPORT_SYMBOL(__cond_resched_softirq); - /** * yield - yield the current processor to other threads. * diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1356afd1eeb6..fbfc3f1d368a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1259,6 +1259,9 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) rq = task_rq_lock(p, &rf); + sched_clock_tick(); + update_rq_clock(rq); + if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); @@ -1278,9 +1281,6 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) if (dl_se->dl_non_contending == 0) goto unlock; - sched_clock_tick(); - update_rq_clock(rq); - sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 15b10e210a6b..e593b4118578 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -823,35 +823,9 @@ static const struct seq_operations sched_debug_sops = { .show = sched_debug_show, }; -static int sched_debug_release(struct inode *inode, struct file *file) -{ - seq_release(inode, file); - - return 0; -} - -static int sched_debug_open(struct inode *inode, struct file *filp) -{ - int ret = 0; - - ret = seq_open(filp, &sched_debug_sops); - - return ret; -} - -static const struct file_operations sched_debug_fops = { - .open = sched_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = sched_debug_release, -}; - static int __init init_sched_debug_procfs(void) { - struct proc_dir_entry *pe; - - pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); - if (!pe) + if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops)) return -ENOMEM; return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1f0a4bc6a39d..cb467c221b15 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -983,7 +983,7 @@ static inline void rq_clock_skip_update(struct rq *rq) } /* - * See rt task throttoling, which is the only time a skip + * See rt task throttling, which is the only time a skip * request is cancelled. */ static inline void rq_clock_cancel_skipupdate(struct rq *rq) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index ab112cbfd7c8..750fb3c67eed 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -120,22 +120,9 @@ static const struct seq_operations schedstat_sops = { .show = show_schedstat, }; -static int schedstat_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &schedstat_sops); -} - -static const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static int __init proc_schedstat_init(void) { - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); - + proc_create_seq("schedstat", 0, NULL, &schedstat_sops); return 0; } subsys_initcall(proc_schedstat_init); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 64cc564f5255..61a1125c1ae4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1708,7 +1708,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (rq && sched_debug_enabled) { - pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } diff --git a/kernel/signal.c b/kernel/signal.c index 9c33163a6165..0f865d67415d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1539,7 +1539,6 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } -#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR) int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct siginfo info; @@ -1568,9 +1567,7 @@ int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct * return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); -#endif -#ifdef SEGV_BNDERR int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct siginfo info; @@ -1584,7 +1581,6 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) info.si_upper = upper; return force_sig_info(info.si_signo, &info, current); } -#endif #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) @@ -2837,8 +2833,19 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) [SIGPOLL] = { NSIGPOLL, SIL_POLL }, [SIGSYS] = { NSIGSYS, SIL_SYS }, }; - if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) + if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) { layout = filter[sig].layout; + /* Handle the exceptions */ + if ((sig == SIGBUS) && + (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) + layout = SIL_FAULT_MCEERR; + else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) + layout = SIL_FAULT_BNDERR; +#ifdef SEGV_PKUERR + else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) + layout = SIL_FAULT_PKUERR; +#endif + } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { @@ -2848,104 +2855,15 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; - /* Tests to support buggy kernel ABIs */ -#ifdef TRAP_FIXME - if ((sig == SIGTRAP) && (si_code == TRAP_FIXME)) - layout = SIL_FAULT; -#endif -#ifdef FPE_FIXME - if ((sig == SIGFPE) && (si_code == FPE_FIXME)) - layout = SIL_FAULT; -#endif } return layout; } int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { - int err; - - if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) + if (copy_to_user(to, from , sizeof(struct siginfo))) return -EFAULT; - if (from->si_code < 0) - return __copy_to_user(to, from, sizeof(siginfo_t)) - ? -EFAULT : 0; - /* - * If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * Please remember to update the signalfd_copyinfo() function - * inside fs/signalfd.c too, in case siginfo_t changes. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. - */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_TIMER: - /* Unreached SI_TIMER is negative */ - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_FAULT: - err |= __put_user(from->si_addr, &to->si_addr); -#ifdef __ARCH_SI_TRAPNO - err |= __put_user(from->si_trapno, &to->si_trapno); -#endif -#ifdef __ia64__ - err |= __put_user(from->si_imm, &to->si_imm); - err |= __put_user(from->si_flags, &to->si_flags); - err |= __put_user(from->si_isr, &to->si_isr); -#endif - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. - */ -#ifdef BUS_MCEERR_AR - if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif -#ifdef BUS_MCEERR_AO - if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif -#ifdef SEGV_BNDERR - if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) { - err |= __put_user(from->si_lower, &to->si_lower); - err |= __put_user(from->si_upper, &to->si_upper); - } -#endif -#ifdef SEGV_PKUERR - if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) - err |= __put_user(from->si_pkey, &to->si_pkey); -#endif - break; - case SIL_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - case SIL_SYS: - err |= __put_user(from->si_call_addr, &to->si_call_addr); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - return err; + return 0; } #ifdef CONFIG_COMPAT @@ -2984,27 +2902,28 @@ int __copy_siginfo_to_user32(struct compat_siginfo __user *to, #ifdef __ARCH_SI_TRAPNO new.si_trapno = from->si_trapno; #endif -#ifdef BUS_MCEERR_AR - if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR)) - new.si_addr_lsb = from->si_addr_lsb; -#endif -#ifdef BUS_MCEERR_AO - if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO)) - new.si_addr_lsb = from->si_addr_lsb; + break; + case SIL_FAULT_MCEERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif -#ifdef SEGV_BNDERR - if ((from->si_signo == SIGSEGV) && - (from->si_code == SEGV_BNDERR)) { - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); - } + new.si_addr_lsb = from->si_addr_lsb; + break; + case SIL_FAULT_BNDERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif -#ifdef SEGV_PKUERR - if ((from->si_signo == SIGSEGV) && - (from->si_code == SEGV_PKUERR)) - new.si_pkey = from->si_pkey; + new.si_lower = ptr_to_compat(from->si_lower); + new.si_upper = ptr_to_compat(from->si_upper); + break; + case SIL_FAULT_PKUERR: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; #endif - + new.si_pkey = from->si_pkey; break; case SIL_CHLD: new.si_pid = from->si_pid; @@ -3070,24 +2989,28 @@ int copy_siginfo_from_user32(struct siginfo *to, #ifdef __ARCH_SI_TRAPNO to->si_trapno = from.si_trapno; #endif -#ifdef BUS_MCEERR_AR - if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR)) - to->si_addr_lsb = from.si_addr_lsb; -#endif -#ifdef BUS_MCEER_AO - if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO)) - to->si_addr_lsb = from.si_addr_lsb; + break; + case SIL_FAULT_MCEERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif -#ifdef SEGV_BNDERR - if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) { - to->si_lower = compat_ptr(from.si_lower); - to->si_upper = compat_ptr(from.si_upper); - } + to->si_addr_lsb = from.si_addr_lsb; + break; + case SIL_FAULT_BNDERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif -#ifdef SEGV_PKUERR - if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR)) - to->si_pkey = from.si_pkey; + to->si_lower = compat_ptr(from.si_lower); + to->si_upper = compat_ptr(from.si_upper); + break; + case SIL_FAULT_PKUERR: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; #endif + to->si_pkey = from.si_pkey; break; case SIL_CHLD: to->si_pid = from.si_pid; diff --git a/kernel/softirq.c b/kernel/softirq.c index 177de3640c78..03981f1c39ea 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -145,8 +145,7 @@ static void __local_bh_enable(unsigned int cnt) } /* - * Special-case - softirqs can safely be enabled in - * cond_resched_softirq(), or by __do_softirq(), + * Special-case - softirqs can safely be enabled by __do_softirq(), * without processing still-pending softirqs: */ void _local_bh_enable(void) diff --git a/kernel/sys.c b/kernel/sys.c index b0eee418ee0d..d1b2b8d934bb 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -71,6 +71,9 @@ #include <asm/io.h> #include <asm/unistd.h> +/* Hardening for Spectre-v1 */ +#include <linux/nospec.h> + #include "uid16.h" #ifndef SET_UNALIGN_CTL @@ -1453,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); @@ -1472,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 9791364925dc..183169c2a75b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -43,7 +43,9 @@ COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents); /* fs/xattr.c */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 0ed768b56c60..675c4e9563a9 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -372,24 +372,12 @@ static const struct seq_operations timer_list_sops = { .show = timer_list_show, }; -static int timer_list_open(struct inode *inode, struct file *filp) -{ - return seq_open_private(filp, &timer_list_sops, - sizeof(struct timer_list_iter)); -} - -static const struct file_operations timer_list_fops = { - .open = timer_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = proc_create("timer_list", 0400, NULL, &timer_list_fops); + pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, + sizeof(struct timer_list_iter), NULL); if (!pe) return -ENOMEM; return 0; diff --git a/kernel/torture.c b/kernel/torture.c index 37b94012a3f8..3de1efbecd6a 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -574,7 +574,7 @@ void stutter_wait(const char *title) { int spt; - cond_resched_rcu_qs(); + cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { if (spt == 1) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 414d7210b2ec..bcd93031d042 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -893,7 +893,7 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -static void tracing_snapshot_instance(struct trace_array *tr) +void tracing_snapshot_instance(struct trace_array *tr) { struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -949,7 +949,7 @@ static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); -static int alloc_snapshot(struct trace_array *tr) +int tracing_alloc_snapshot_instance(struct trace_array *tr) { int ret; @@ -995,7 +995,7 @@ int tracing_alloc_snapshot(void) struct trace_array *tr = &global_trace; int ret; - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); WARN_ON(ret < 0); return ret; @@ -5408,7 +5408,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } @@ -6451,7 +6451,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, } #endif if (!tr->allocated_snapshot) { - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) break; } @@ -7179,7 +7179,7 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_reg: - ret = alloc_snapshot(tr); + ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6fb46a06c9dc..507954b4e058 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1817,6 +1817,17 @@ static inline void __init trace_event_init(void) { } static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif +#ifdef CONFIG_TRACER_SNAPSHOT +void tracing_snapshot_instance(struct trace_array *tr); +int tracing_alloc_snapshot_instance(struct trace_array *tr); +#else +static inline void tracing_snapshot_instance(struct trace_array *tr) { } +static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) +{ + return 0; +} +#endif + extern struct trace_iterator *tracepoint_print_iter; #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 22fee766081b..80e0b2aca703 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -159,13 +159,13 @@ static int benchmark_event_kthread(void *arg) * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * - * Note the _rcu_qs() version of cond_resched() will + * Note the tasks_rcu_qs() version of cond_resched() will * notify synchronize_rcu_tasks() that this thread has * passed a quiescent state for rcu_tasks. Otherwise * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched(); + cond_resched_tasks_rcu_qs(); } return 0; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d251cabcf69a..8b5bdcf64871 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -483,9 +483,10 @@ clear_event_triggers(struct trace_array *tr) struct trace_event_file *file; list_for_each_entry(file, &tr->events, list) { - struct event_trigger_data *data; - list_for_each_entry_rcu(data, &file->triggers, list) { + struct event_trigger_data *data, *n; + list_for_each_entry_safe(data, n, &file->triggers, list) { trace_event_trigger_enable_disable(file, 0); + list_del_rcu(&data->list); if (data->ops->free) data->ops->free(data->ops, data); } @@ -642,6 +643,7 @@ event_trigger_callback(struct event_command *cmd_ops, trigger_data->count = -1; trigger_data->ops = trigger_ops; trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = file; INIT_LIST_HEAD(&trigger_data->list); INIT_LIST_HEAD(&trigger_data->named_list); @@ -1053,7 +1055,12 @@ static void snapshot_trigger(struct event_trigger_data *data, void *rec, struct ring_buffer_event *event) { - tracing_snapshot(); + struct trace_event_file *file = data->private_data; + + if (file) + tracing_snapshot_instance(file->tr); + else + tracing_snapshot(); } static void @@ -1076,7 +1083,7 @@ register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, { int ret = register_trigger(glob, ops, data, file); - if (ret > 0 && tracing_alloc_snapshot() != 0) { + if (ret > 0 && tracing_alloc_snapshot_instance(file->tr) != 0) { unregister_trigger(glob, ops, data, file); ret = 0; } |