diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/syscall.c | 37 | ||||
-rw-r--r-- | kernel/debug/debug_core.c | 4 | ||||
-rw-r--r-- | kernel/debug/gdbstub.c | 13 | ||||
-rw-r--r-- | kernel/debug/kdb/kdb_io.c | 72 | ||||
-rw-r--r-- | kernel/dma/Kconfig | 3 | ||||
-rw-r--r-- | kernel/dma/direct.c | 61 | ||||
-rw-r--r-- | kernel/dma/pool.c | 87 | ||||
-rw-r--r-- | kernel/dma/remap.c | 5 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 2 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/irq/manage.c | 37 | ||||
-rw-r--r-- | kernel/kallsyms.c | 17 | ||||
-rw-r--r-- | kernel/kexec_file.c | 34 | ||||
-rw-r--r-- | kernel/kprobes.c | 4 | ||||
-rw-r--r-- | kernel/module.c | 55 | ||||
-rw-r--r-- | kernel/padata.c | 4 | ||||
-rw-r--r-- | kernel/rcu/rcuperf.c | 2 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 32 | ||||
-rw-r--r-- | kernel/sched/core.c | 118 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 1 | ||||
-rw-r--r-- | kernel/sched/fair.c | 17 | ||||
-rw-r--r-- | kernel/sched/idle.c | 15 | ||||
-rw-r--r-- | kernel/sched/sched.h | 2 | ||||
-rw-r--r-- | kernel/signal.c | 10 | ||||
-rw-r--r-- | kernel/smp.c | 18 | ||||
-rw-r--r-- | kernel/task_work.c | 16 | ||||
-rw-r--r-- | kernel/time/timer.c | 21 |
27 files changed, 447 insertions, 242 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a74fce8ce043..0fd80ac81f70 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3137,7 +3137,8 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; @@ -3163,7 +3164,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } @@ -3219,7 +3220,8 @@ static int set_info_rec_size(struct bpf_prog_info *info) return 0; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3288,11 +3290,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -3326,7 +3328,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -3361,7 +3363,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; @@ -3392,7 +3394,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -3449,7 +3451,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; @@ -3495,7 +3497,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3538,7 +3541,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3553,7 +3557,8 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } -static int bpf_link_get_info_by_fd(struct bpf_link *link, +static int bpf_link_get_info_by_fd(struct file *file, + struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3606,15 +3611,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_link_fops) - err = bpf_link_get_info_by_fd(f.file->private_data, + err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else err = -EINVAL; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index bc8d25f2ac8a..9e5934780f41 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -587,6 +587,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, arch_kgdb_ops.disable_hw_break(regs); acquirelock: + rcu_read_lock(); /* * Interrupts will be restored by the 'trap return' code, except when * single stepping. @@ -646,6 +647,7 @@ return_normal: atomic_dec(&slaves_in_kgdb); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); return 0; } cpu_relax(); @@ -664,6 +666,7 @@ return_normal: raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); goto acquirelock; } @@ -787,6 +790,7 @@ kgdb_restore: raw_spin_unlock(&dbg_master_lock); dbg_touch_watchdogs(); local_irq_restore(flags); + rcu_read_unlock(); return kgdb_info[cpu].ret_state; } diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 61774aec46b4..a790026e42d0 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -792,6 +792,19 @@ static void gdb_cmd_query(struct kgdb_state *ks) } break; #endif +#ifdef CONFIG_HAVE_ARCH_KGDB_QXFER_PKT + case 'S': + if (!strncmp(remcom_in_buffer, "qSupported:", 11)) + strcpy(remcom_out_buffer, kgdb_arch_gdb_stub_feature); + break; + case 'X': + if (!strncmp(remcom_in_buffer, "qXfer:", 6)) + kgdb_arch_handle_qxfer_pkt(remcom_in_buffer, + remcom_out_buffer); + break; +#endif + default: + break; } } diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 924bc9298a42..683a799618ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -542,6 +542,44 @@ static int kdb_search_string(char *searched, char *searchfor) return 0; } +static void kdb_msg_write(const char *msg, int msg_len) +{ + struct console *c; + + if (msg_len == 0) + return; + + if (dbg_io_ops) { + const char *cp = msg; + int len = msg_len; + + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + + for_each_console(c) { + if (!(c->flags & CON_ENABLED)) + continue; + if (c == dbg_io_ops->cons) + continue; + /* + * Set oops_in_progress to encourage the console drivers to + * disregard their internal spin locks: in the current calling + * context the risk of deadlock is a bigger problem than risks + * due to re-entering the console driver. We operate directly on + * oops_in_progress rather than using bust_spinlocks() because + * the calls bust_spinlocks() makes on exit are not appropriate + * for this calling context. + */ + ++oops_in_progress; + c->write(c, msg, msg_len); + --oops_in_progress; + touch_nmi_watchdog(); + } +} + int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) { int diag; @@ -553,7 +591,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int this_cpu, old_cpu; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; - struct console *c; unsigned long uninitialized_var(flags); /* Serialize kdb_printf if multiple cpus try to write at once. @@ -687,22 +724,11 @@ kdb_printit: */ retlen = strlen(kdb_buffer); cp = (char *) printk_skip_headers(kdb_buffer); - if (!dbg_kdb_mode && kgdb_connected) { + if (!dbg_kdb_mode && kgdb_connected) gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); - } else { - if (dbg_io_ops && !dbg_io_ops->is_console) { - len = retlen - (cp - kdb_buffer); - cp2 = cp; - while (len--) { - dbg_io_ops->write_char(*cp2); - cp2++; - } - } - for_each_console(c) { - c->write(c, cp, retlen - (cp - kdb_buffer)); - touch_nmi_watchdog(); - } - } + else + kdb_msg_write(cp, retlen - (cp - kdb_buffer)); + if (logging) { saved_loglevel = console_loglevel; console_loglevel = CONSOLE_LOGLEVEL_SILENT; @@ -751,19 +777,7 @@ kdb_printit: moreprompt = "more> "; kdb_input_flush(); - - if (dbg_io_ops && !dbg_io_ops->is_console) { - len = strlen(moreprompt); - cp = moreprompt; - while (len--) { - dbg_io_ops->write_char(*cp); - cp++; - } - } - for_each_console(c) { - c->write(c, moreprompt, strlen(moreprompt)); - touch_nmi_watchdog(); - } + kdb_msg_write(moreprompt, strlen(moreprompt)); if (logging) printk("%s", moreprompt); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index a0ce3c1494fd..1da3f44f2565 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -71,15 +71,16 @@ config SWIOTLB # in the pagetables # config DMA_NONCOHERENT_MMAP + default y if !MMU bool config DMA_COHERENT_POOL + select GENERIC_ALLOCATOR bool config DMA_REMAP bool depends on MMU - select GENERIC_ALLOCATOR select DMA_NONCOHERENT_MMAP config DMA_DIRECT_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ecb922a0bfa0..67f060b86a73 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -70,7 +70,7 @@ gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, return 0; } -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); @@ -109,14 +109,15 @@ static inline bool dma_should_free_from_pool(struct device *dev, return false; } -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { - size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; u64 phys_limit; + WARN_ON_ONCE(!PAGE_ALIGNED(size)); + if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -124,14 +125,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp &= ~__GFP_ZERO; gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); - page = dma_alloc_contiguous(dev, alloc_size, gfp); + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_free_contiguous(dev, page, alloc_size); + dma_free_contiguous(dev, page, size); page = NULL; } again: if (!page) - page = alloc_pages_node(node, gfp, get_order(alloc_size)); + page = alloc_pages_node(node, gfp, get_order(size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); page = NULL; @@ -157,9 +158,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, { struct page *page; void *ret; + int err; + + size = PAGE_ALIGN(size); if (dma_should_alloc_from_pool(dev, gfp, attrs)) { - ret = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &page, gfp); + ret = dma_alloc_from_pool(dev, size, &page, gfp); if (!ret) return NULL; goto done; @@ -183,14 +187,20 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_alloc_need_uncached(dev, attrs)) || (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + ret = dma_common_contiguous_remap(page, size, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) goto out_free_pages; + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); goto done; } @@ -207,8 +217,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) - set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); + if (force_dma_unencrypted(dev)) { + err = set_memory_decrypted((unsigned long)ret, + 1 << get_order(size)); + if (err) + goto out_free_pages; + } memset(ret, 0, size); @@ -217,7 +231,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) - goto out_free_pages; + goto out_encrypt_pages; } done: if (force_dma_unencrypted(dev)) @@ -225,6 +239,15 @@ done: else *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; + +out_encrypt_pages: + if (force_dma_unencrypted(dev)) { + err = set_memory_encrypted((unsigned long)page_address(page), + 1 << get_order(size)); + /* If memory cannot be re-encrypted, it must be leaked */ + if (err) + return NULL; + } out_free_pages: dma_free_contiguous(dev, page, size); return NULL; @@ -459,7 +482,6 @@ int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, return ret; } -#ifdef CONFIG_MMU bool dma_direct_can_mmap(struct device *dev) { return dev_is_dma_coherent(dev) || @@ -485,19 +507,6 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); } -#else /* CONFIG_MMU */ -bool dma_direct_can_mmap(struct device *dev) -{ - return false; -} - -int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - return -ENXIO; -} -#endif /* CONFIG_MMU */ int dma_direct_supported(struct device *dev, u64 mask) { diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 8cfa01243ed2..6bc74a2d5127 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -6,7 +6,6 @@ #include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-noncoherent.h> -#include <linux/dma-contiguous.h> #include <linux/init.h> #include <linux/genalloc.h> #include <linux/set_memory.h> @@ -69,12 +68,7 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, do { pool_size = 1 << (PAGE_SHIFT + order); - - if (dev_get_cma_area(NULL)) - page = dma_alloc_from_contiguous(NULL, 1 << order, - order, false); - else - page = alloc_pages(gfp, order); + page = alloc_pages(gfp, order); } while (!page && order-- > 0); if (!page) goto out; @@ -118,8 +112,7 @@ remove_mapping: dma_common_free_remap(addr, pool_size); #endif free_page: __maybe_unused - if (!dma_release_from_contiguous(NULL, page, 1 << order)) - __free_pages(page, order); + __free_pages(page, order); out: return ret; } @@ -203,7 +196,7 @@ static int __init dma_atomic_pool_init(void) } postcore_initcall(dma_atomic_pool_init); -static inline struct gen_pool *dev_to_pool(struct device *dev) +static inline struct gen_pool *dma_guess_pool_from_device(struct device *dev) { u64 phys_mask; gfp_t gfp; @@ -217,47 +210,79 @@ static inline struct gen_pool *dev_to_pool(struct device *dev) return atomic_pool_kernel; } -static bool dma_in_atomic_pool(struct device *dev, void *start, size_t size) +static inline struct gen_pool *dma_get_safer_pool(struct gen_pool *bad_pool) +{ + if (bad_pool == atomic_pool_kernel) + return atomic_pool_dma32 ? : atomic_pool_dma; + + if (bad_pool == atomic_pool_dma32) + return atomic_pool_dma; + + return NULL; +} + +static inline struct gen_pool *dma_guess_pool(struct device *dev, + struct gen_pool *bad_pool) { - struct gen_pool *pool = dev_to_pool(dev); + if (bad_pool) + return dma_get_safer_pool(bad_pool); - if (unlikely(!pool)) - return false; - return gen_pool_has_addr(pool, (unsigned long)start, size); + return dma_guess_pool_from_device(dev); } void *dma_alloc_from_pool(struct device *dev, size_t size, struct page **ret_page, gfp_t flags) { - struct gen_pool *pool = dev_to_pool(dev); - unsigned long val; + struct gen_pool *pool = NULL; + unsigned long val = 0; void *ptr = NULL; - - if (!pool) { - WARN(1, "%pGg atomic pool not initialised!\n", &flags); - return NULL; + phys_addr_t phys; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) { + WARN(1, "Failed to get suitable pool for %s\n", + dev_name(dev)); + break; + } + + val = gen_pool_alloc(pool, size); + if (!val) + continue; + + phys = gen_pool_virt_to_phys(pool, val); + if (dma_coherent_ok(dev, phys, size)) + break; + + gen_pool_free(pool, val, size); + val = 0; } - val = gen_pool_alloc(pool, size); - if (val) { - phys_addr_t phys = gen_pool_virt_to_phys(pool, val); + if (val) { *ret_page = pfn_to_page(__phys_to_pfn(phys)); ptr = (void *)val; memset(ptr, 0, size); + + if (gen_pool_avail(pool) < atomic_pool_size) + schedule_work(&atomic_pool_work); } - if (gen_pool_avail(pool) < atomic_pool_size) - schedule_work(&atomic_pool_work); return ptr; } bool dma_free_from_pool(struct device *dev, void *start, size_t size) { - struct gen_pool *pool = dev_to_pool(dev); + struct gen_pool *pool = NULL; + + while (1) { + pool = dma_guess_pool(dev, pool); + if (!pool) + return false; - if (!dma_in_atomic_pool(dev, start, size)) - return false; - gen_pool_free(pool, (unsigned long)start, size); - return true; + if (gen_pool_has_addr(pool, (unsigned long)start, size)) { + gen_pool_free(pool, (unsigned long)start, size); + return true; + } + } } diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index e739a6eea6e7..78b23f089cf1 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -24,7 +24,8 @@ void *dma_common_pages_remap(struct page **pages, size_t size, { void *vaddr; - vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + vaddr = vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT, + VM_DMA_COHERENT, prot); if (vaddr) find_vm_area(vaddr)->pages = pages; return vaddr; @@ -37,7 +38,7 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int count = size >> PAGE_SHIFT; + int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct page **pages; void *vaddr; int i; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0862873dba..5f8b0c52fd2e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2199,7 +2199,7 @@ static void handle_swbp(struct pt_regs *regs) if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ - send_sig(SIGTRAP, current, 0); + force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..efc5493203ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1977,7 +1977,7 @@ static __latent_entropy struct task_struct *copy_process( * to stop root fork bombs. */ retval = -EAGAIN; - if (nr_threads >= max_threads) + if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..2a9fec53e159 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,9 +195,9 @@ void irq_set_thread_affinity(struct irq_desc *desc) set_bit(IRQTF_AFFINITY, &action->thread_flags); } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static void irq_validate_effective_affinity(struct irq_data *data) { -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK const struct cpumask *m = irq_data_get_effective_affinity_mask(data); struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -205,9 +205,19 @@ static void irq_validate_effective_affinity(struct irq_data *data) return; pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", chip->name, data->irq); -#endif } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) +{ + cpumask_copy(irq_data_get_effective_affinity_mask(data), mask); +} +#else +static inline void irq_validate_effective_affinity(struct irq_data *data) { } +static inline void irq_init_effective_affinity(struct irq_data *data, + const struct cpumask *mask) { } +#endif + int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -304,6 +314,26 @@ static int irq_try_set_affinity(struct irq_data *data, return ret; } +static bool irq_set_affinity_deactivated(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + /* + * If the interrupt is not yet activated, just store the affinity + * mask and do not call the chip driver at all. On activation the + * driver has to make sure anyway that the interrupt is in a + * useable state so startup works. + */ + if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data)) + return false; + + cpumask_copy(desc->irq_common_data.affinity, mask); + irq_init_effective_affinity(data, mask); + irqd_set(data, IRQD_AFFINITY_SET); + return true; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -314,6 +344,9 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; + if (irq_set_affinity_deactivated(data, mask, force)) + return 0; + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { ret = irq_try_set_affinity(data, mask, force); } else { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..bb14e64f62a4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -644,19 +644,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -673,7 +674,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index bb05fd52de85..09cc78df53c6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image) static int kimage_validate_signature(struct kimage *image) { - const char *reason; int ret; ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, image->kernel_buf_len); - switch (ret) { - case 0: - break; + if (ret) { - /* Certain verification errors are non-fatal if we're not - * checking errors, provided we aren't mandating that there - * must be a valid signature. - */ - case -ENODATA: - reason = "kexec of unsigned image"; - goto decide; - case -ENOPKG: - reason = "kexec of image with unsupported crypto"; - goto decide; - case -ENOKEY: - reason = "kexec of image with unavailable key"; - decide: if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { - pr_notice("%s rejected\n", reason); + pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } - /* If IMA is guaranteed to appraise a signature on the kexec + /* + * If IMA is guaranteed to appraise a signature on the kexec * image, permit it even if the kernel is otherwise locked * down. */ @@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image) security_locked_down(LOCKDOWN_KEXEC)) return -EPERM; - return 0; - - /* All other errors are fatal, including nomem, unparseable - * signatures and signature check failures - even if signatures - * aren't required. - */ - default: - pr_notice("kernel signature verification failed (%d).\n", ret); + pr_debug("kernel signature verification failed (%d).\n", ret); } - return ret; + return 0; } #endif diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..2e97febeef77 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2448,7 +2448,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(pi->file->f_cred)) addr = NULL; if (sym) @@ -2540,7 +2540,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(m->file->f_cred)) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..aa183c9ac0a2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1510,8 +1510,7 @@ static inline bool sect_empty(const Elf_Shdr *sect) } struct module_sect_attr { - struct module_attribute mattr; - char *name; + struct bin_attribute battr; unsigned long address; }; @@ -1521,13 +1520,18 @@ struct module_sect_attrs { struct module_sect_attr attrs[]; }; -static ssize_t module_sect_show(struct module_attribute *mattr, - struct module_kobject *mk, char *buf) +static ssize_t module_sect_read(struct file *file, struct kobject *kobj, + struct bin_attribute *battr, + char *buf, loff_t pos, size_t count) { struct module_sect_attr *sattr = - container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%px\n", kptr_restrict < 2 ? - (void *)sattr->address : NULL); + container_of(battr, struct module_sect_attr, battr); + + if (pos != 0) + return -EINVAL; + + return sprintf(buf, "0x%px\n", + kallsyms_show_value(file->f_cred) ? (void *)sattr->address : NULL); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -1535,7 +1539,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) unsigned int section; for (section = 0; section < sect_attrs->nsections; section++) - kfree(sect_attrs->attrs[section].name); + kfree(sect_attrs->attrs[section].battr.attr.name); kfree(sect_attrs); } @@ -1544,42 +1548,41 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info) unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; struct module_sect_attr *sattr; - struct attribute **gattr; + struct bin_attribute **gattr; /* Count loaded sections and allocate structures */ for (i = 0; i < info->hdr->e_shnum; i++) if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded), - sizeof(sect_attrs->grp.attrs[0])); - size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); + sizeof(sect_attrs->grp.bin_attrs[0])); + size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]); sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL); if (sect_attrs == NULL) return; /* Setup section attributes. */ sect_attrs->grp.name = "sections"; - sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; + sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0]; sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; - gattr = §_attrs->grp.attrs[0]; + gattr = §_attrs->grp.bin_attrs[0]; for (i = 0; i < info->hdr->e_shnum; i++) { Elf_Shdr *sec = &info->sechdrs[i]; if (sect_empty(sec)) continue; + sysfs_bin_attr_init(&sattr->battr); sattr->address = sec->sh_addr; - sattr->name = kstrdup(info->secstrings + sec->sh_name, - GFP_KERNEL); - if (sattr->name == NULL) + sattr->battr.attr.name = + kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); + if (sattr->battr.attr.name == NULL) goto out; sect_attrs->nsections++; - sysfs_attr_init(&sattr->mattr.attr); - sattr->mattr.show = module_sect_show; - sattr->mattr.store = NULL; - sattr->mattr.attr.name = sattr->name; - sattr->mattr.attr.mode = S_IRUSR; - *(gattr++) = &(sattr++)->mattr.attr; + sattr->battr.read = module_sect_read; + sattr->battr.size = 3 /* "0x", "\n" */ + (BITS_PER_LONG / 4); + sattr->battr.attr.mode = 0400; + *(gattr++) = &(sattr++)->battr; } *gattr = NULL; @@ -1669,7 +1672,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) continue; if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); - nattr->attr.name = mod->sect_attrs->attrs[loaded].name; + nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name; nattr->attr.mode = S_IRUGO; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *) info->sechdrs[i].sh_addr; @@ -2783,7 +2786,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return vmalloc_exec(size); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } bool __weak module_init_section(const char *name) @@ -4377,7 +4382,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(file->f_cred) ? NULL : (void *)8ul; } return err; diff --git a/kernel/padata.c b/kernel/padata.c index 29fc5d87a4cd..4373f7adaa40 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -335,7 +335,7 @@ static void padata_reorder(struct parallel_data *pd) * * Ensure reorder queue is read after pd->lock is dropped so we see * new objects from another task in padata_do_serial. Pairs with - * smp_mb__after_atomic in padata_do_serial. + * smp_mb in padata_do_serial. */ smp_mb(); @@ -418,7 +418,7 @@ void padata_do_serial(struct padata_priv *padata) * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb * in padata_reorder. */ - smp_mb__after_atomic(); + smp_mb(); padata_reorder(pd); } diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..9eb39c20082c 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -723,7 +723,7 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } - pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c716eadc7617..6c6569e0586c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -250,7 +250,7 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); @@ -274,13 +274,13 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); + arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ } } @@ -313,7 +313,7 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -633,6 +633,10 @@ static noinstr void rcu_eqs_enter(bool user) do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... @@ -692,6 +696,7 @@ noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + instrumentation_begin(); /* * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. * (We are exiting an NMI handler, so RCU better be paying attention @@ -705,7 +710,6 @@ noinstr void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ @@ -714,13 +718,15 @@ noinstr void rcu_nmi_exit(void) return; } - instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (!in_nmi()) rcu_prepare_for_idle(); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); // RCU is watching here ... @@ -838,6 +844,10 @@ static void noinstr rcu_eqs_exit(bool user) rcu_dynticks_eqs_exit(); // ... but is watching here. instrumentation_begin(); + + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -983,13 +993,21 @@ noinstr void rcu_nmi_enter(void) if (!in_nmi()) rcu_cleanup_after_idle(); + instrumentation_begin(); + // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + incby = 1; } else if (!in_nmi()) { instrumentation_begin(); rcu_irq_enter_check_tick(); instrumentation_end(); + } else { + instrumentation_begin(); } - instrumentation_begin(); + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f360326861e..2142c6767682 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) void activate_task(struct rq *rq, struct task_struct *p, int flags) { - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; - enqueue_task(rq, p, flags); p->on_rq = TASK_ON_RQ_QUEUED; @@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(rq, p, flags); } @@ -1637,7 +1631,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* @@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, lockdep_assert_held(&rq->lock); -#ifdef CONFIG_SMP if (p->sched_contributes_to_load) rq->nr_uninterruptible--; +#ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; #endif @@ -2293,8 +2287,15 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } @@ -2322,7 +2323,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - __smp_call_single_queue(cpu, &p->wake_entry); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2369,7 +2370,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) * the soon-to-be-idle CPU as the current CPU is likely busy. * nr_running is checked to avoid unnecessary task stacking. */ - if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) return true; return false; @@ -2378,6 +2379,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + if (WARN_ON_ONCE(cpu == smp_processor_id())) + return false; + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -2528,7 +2532,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; success = 1; - cpu = task_cpu(p); trace_sched_waking(p); p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2550,7 +2553,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* We're going to change ->state: */ success = 1; - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2575,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). */ smp_rmb(); - if (p->on_rq && ttwu_remote(p, wake_flags)) + if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) goto unlock; if (p->in_iowait) { @@ -2584,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) } #ifdef CONFIG_SMP - p->sched_contributes_to_load = !!task_contributes_to_load(p); - p->state = TASK_WAKING; - /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -2605,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). */ - smp_rmb(); + smp_acquire__after_ctrl_dep(); + + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + p->state = TASK_WAKING; /* * If the owning (remote) CPU is still in the middle of schedule() with @@ -2614,8 +2625,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * which potentially sends an IPI instead of spinning on p->on_cpu to * let the waker make forward progress. This is safe because IRQs are * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. */ - if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) goto unlock; /* @@ -2635,6 +2659,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2642,7 +2668,7 @@ unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: if (success) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, task_cpu(p), wake_flags); preempt_enable(); return success; @@ -2763,7 +2789,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP - p->wake_entry_type = CSD_TYPE_TTWU; + p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif } @@ -2939,6 +2965,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); + rseq_migrate(p); /* * We're setting the CPU for the first time, we don't migrate, * so use __set_task_cpu(). @@ -3003,6 +3030,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); + rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -4074,6 +4102,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; + unsigned long prev_state; struct rq_flags rf; struct rq *rq; int cpu; @@ -4093,9 +4122,16 @@ static void __sched notrace __schedule(bool preempt) /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) - * done by the caller to avoid the race with signal_wake_up(). + * done by the caller to avoid the race with signal_wake_up(): * - * The membarrier system call requires a full memory barrier + * __set_current_state(@state) signal_wake_up() + * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) + * wake_up_state(p, state) + * LOCK rq->lock LOCK p->pi_state + * smp_mb__after_spinlock() smp_mb__after_spinlock() + * if (signal_pending_state()) if (p->state & @state) + * + * Also, the membarrier system call requires a full memory barrier * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); @@ -4106,10 +4142,38 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); switch_count = &prev->nivcsw; - if (!preempt && prev->state) { - if (signal_pending_state(prev->state, prev)) { + + /* + * We must load prev->state once (task_struct::state is volatile), such + * that: + * + * - we form a control dependency vs deactivate_task() below. + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; + if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev->flags & PF_FROZEN); + + if (prev->sched_contributes_to_load) + rq->nr_uninterruptible++; + + /* + * __schedule() ttwu() + * prev_state = prev->state; if (p->on_rq && ...) + * if (prev_state) goto out; + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); + * p->state = TASK_WAKING + * + * Where __schedule() and ttwu() have matching control dependencies. + * + * After this, schedule() must not care about p->state any more. + */ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); if (prev->in_iowait) { @@ -4421,6 +4485,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -4533,7 +4598,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) */ if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; queue_flag |= ENQUEUE_REPLENISH; } else diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 504d2f51b0d6..f63f337c7147 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2692,6 +2692,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; + dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..04fa8dbcfa4d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct task_struct *p) } } - sa->runnable_avg = cpu_scale; + sa->runnable_avg = sa->util_avg; if (p->sched_class != &fair_sched_class) { /* @@ -4039,7 +4039,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -7638,7 +7642,14 @@ static int detach_tasks(struct lb_env *env) switch (env->migration_type) { case migrate_load: - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 05deb81bb3e3..1ae95b9150d3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -96,6 +96,15 @@ void __cpuidle default_idle_call(void) } } +static int call_cpuidle_s2idle(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + if (current_clr_polling_and_test()) + return -EBUSY; + + return cpuidle_enter_s2idle(drv, dev); +} + static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { @@ -171,11 +180,9 @@ static void cpuidle_idle_call(void) if (idle_should_enter_s2idle()) { rcu_idle_enter(); - entered_state = cpuidle_enter_s2idle(drv, dev); - if (entered_state > 0) { - local_irq_enable(); + entered_state = call_cpuidle_s2idle(drv, dev); + if (entered_state > 0) goto exit_idle; - } rcu_idle_exit(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d4e94c1e5fe..877fb08eb1b0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1682,7 +1682,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_RQ 0x08 /* Wakee is on_rq */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution diff --git a/kernel/signal.c b/kernel/signal.c index 5ca48cc5da76..ee22ec78fd6d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2529,9 +2529,6 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) - task_work_run(); - if (unlikely(uprobe_deny_signal())) return false; @@ -2544,6 +2541,13 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + if (unlikely(current->task_works)) { + spin_unlock_irq(&sighand->siglock); + task_work_run(); + goto relock; + } + /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes diff --git a/kernel/smp.c b/kernel/smp.c index 472c2b274c65..aa17eedff5be 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,24 +669,6 @@ void __init smp_init(void) { int num_nodes, num_cpus; - /* - * Ensure struct irq_work layout matches so that - * flush_smp_call_function_queue() can do horrible things. - */ - BUILD_BUG_ON(offsetof(struct irq_work, llnode) != - offsetof(struct __call_single_data, llist)); - BUILD_BUG_ON(offsetof(struct irq_work, func) != - offsetof(struct __call_single_data, func)); - BUILD_BUG_ON(offsetof(struct irq_work, flags) != - offsetof(struct __call_single_data, flags)); - - /* - * Assert the CSD_TYPE_TTWU layout is similar enough - * for task_struct to be on the @call_single_queue. - */ - BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != - offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); - idle_threads_init(); cpuhp_threads_init(); diff --git a/kernel/task_work.c b/kernel/task_work.c index 825f28259a19..5c0848ca1287 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,9 +25,10 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * 0 if succeeds or -ESRCH. */ int -task_work_add(struct task_struct *task, struct callback_head *work, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, int notify) { struct callback_head *head; + unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -36,8 +37,19 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) work->next = head; } while (cmpxchg(&task->task_works, head, work) != head); - if (notify) + switch (notify) { + case TWA_RESUME: set_notify_resume(task); + break; + case TWA_SIGNAL: + if (lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } + break; + } + return 0; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 398e6eadb861..df1ff803acc4 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -521,8 +521,8 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk) * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ - if (expires >= WHEEL_TIMEOUT_CUTOFF) - expires = WHEEL_TIMEOUT_MAX; + if (delta >= WHEEL_TIMEOUT_CUTOFF) + expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1); } @@ -584,7 +584,15 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * Set the next expiry time and kick the CPU so it can reevaluate the * wheel: */ - base->next_expiry = timer->expires; + if (time_before(timer->expires, base->clk)) { + /* + * Prevent from forward_timer_base() moving the base->clk + * backward + */ + base->next_expiry = base->clk; + } else { + base->next_expiry = timer->expires; + } wake_up_nohz_cpu(base->cpu); } @@ -896,10 +904,13 @@ static inline void forward_timer_base(struct timer_base *base) * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ - if (time_after(base->next_expiry, jnow)) + if (time_after(base->next_expiry, jnow)) { base->clk = jnow; - else + } else { + if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) + return; base->clk = base->next_expiry; + } #endif } |