Diffstat (limited to 'kernel'): 38 files changed, 481 insertions, 632 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cb5441ad75f..d8520095ca03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16884,8 +16884,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, spi = i / BPF_REG_SIZE; if (exact != NOT_EXACT && - old->stack[spi].slot_type[i % BPF_REG_SIZE] != - cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + (i >= cur->allocated_stack || + old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE])) return false; if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ) diff --git a/kernel/cpu.c b/kernel/cpu.c index 1209ddaec026..b1fd2a3db91a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2689,6 +2689,16 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } +/** + * Check if the core a CPU belongs to is online + */ +#if !defined(topology_is_core_online) +static inline bool topology_is_core_online(unsigned int cpu) +{ + return true; +} +#endif + int cpuhp_smt_enable(void) { int cpu, ret = 0; @@ -2699,7 +2709,7 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; - if (!cpu_smt_thread_allowed(cpu)) + if (!cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu)) continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index d3b4cd12bdd1..64d44a52c011 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -423,7 +423,8 @@ retry: if (high && search_end == CRASH_ADDR_HIGH_MAX) { search_end = CRASH_ADDR_LOW_MAX; search_base = 0; - goto retry; + if (search_end != CRASH_ADDR_HIGH_MAX) + goto retry; } pr_warn("cannot allocate crashkernel (size:0x%llx)\n", crash_size); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a6e3792b15f8..d570535342cb 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -416,8 +416,11 @@ static unsigned long long phys_addr(struct dma_debug_entry *entry) * dma_active_cacheline entry to track per event. dma_map_sg(), on the * other hand, consumes a single dma_debug_entry, but inserts 'nents' * entries into the tree. + * + * Use __GFP_NOWARN because the printk from an OOM, to netconsole, could end + * up right back in the DMA debugging code, leading to a deadlock. 
*/ -static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC); +static RADIX_TREE(dma_active_cacheline, GFP_ATOMIC | __GFP_NOWARN); static DEFINE_SPINLOCK(radix_lock); #define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) #define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) diff --git a/kernel/events/core.c b/kernel/events/core.c index 0d9c4985b0aa..c973e3c11e03 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9328,21 +9328,19 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; int i; - if (prog->aux->func_cnt == 0) { - perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, - (u64)(unsigned long)prog->bpf_func, - prog->jited_len, unregister, - prog->aux->ksym.name); - } else { - for (i = 0; i < prog->aux->func_cnt; i++) { - struct bpf_prog *subprog = prog->aux->func[i]; - - perf_event_ksymbol( - PERF_RECORD_KSYMBOL_TYPE_BPF, - (u64)(unsigned long)subprog->bpf_func, - subprog->jited_len, unregister, - subprog->aux->ksym.name); - } + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, + prog->aux->ksym.name); + + for (i = 1; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, + subprog->aux->ksym.name); } } @@ -9708,7 +9706,8 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); - if (event->prog && !bpf_overflow_handler(event, data, regs)) + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && + !bpf_overflow_handler(event, data, regs)) return ret; /* diff --git a/kernel/fork.c b/kernel/fork.c index cc760491f201..18bdc87209d0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2053,11 +2053,24 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - bool thread = flags & PIDFD_THREAD; - - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + if (!pid) return -EINVAL; + scoped_guard(rcu) { + struct task_struct *tsk; + + if (flags & PIDFD_THREAD) + tsk = pid_task(pid, PIDTYPE_PID); + else + tsk = pid_task(pid, PIDTYPE_TGID); + if (!tsk) + return -EINVAL; + + /* Don't create pidfds for kernel threads for now. */ + if (tsk->flags & PF_KTHREAD) + return -EINVAL; + } + return __pidfd_prepare(pid, flags, ret); } @@ -2403,6 +2416,12 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; + /* Don't create pidfds for kernel threads for now. */ + if (args->kthread) { + retval = -EINVAL; + goto bad_fork_free_pid; + } + /* Note that no task has been attached to @pid yet. 
*/ retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 07e99c936ba5..1dee88ba0ae4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -530,6 +530,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } + flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4ad5ed8adf96..6dc76b590703 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -236,7 +236,7 @@ void static_key_disable_cpuslocked(struct static_key *key) } jump_label_lock(); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } @@ -289,7 +289,7 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key) return; guard(mutex)(&jump_label_mutex); - if (atomic_cmpxchg(&key->enabled, 1, 0)) + if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); else WARN_ON_ONCE(!static_key_slow_try_dec(key)); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index fb2c77368d18..a9a0ca605d4a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -160,38 +160,6 @@ unsigned long kallsyms_sym_address(int idx) return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } -static void cleanup_symbol_name(char *s) -{ - char *res; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return; - - /* - * LLVM appends various suffixes for local functions and variables that - * must be promoted to global scope as part of LTO. This can break - * hooking of static functions with kprobes. '.' is not a valid - * character in an identifier in C. Suffixes only in LLVM LTO observed: - * - foo.llvm.[0-9a-f]+ - */ - res = strstr(s, ".llvm."); - if (res) - *res = '\0'; - - return; -} - -static int compare_symbol_name(const char *name, char *namebuf) -{ - /* The kallsyms_seqs_of_names is sorted based on names after - * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. - * To ensure correct bisection in kallsyms_lookup_names(), do - * cleanup_symbol_name(namebuf) before comparing name and namebuf. - */ - cleanup_symbol_name(namebuf); - return strcmp(name, namebuf); -} - static unsigned int get_symbol_seq(int index) { unsigned int i, seq = 0; @@ -219,7 +187,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(mid); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = compare_symbol_name(name, namebuf); + ret = strcmp(name, namebuf); if (ret > 0) low = mid + 1; else if (ret < 0) @@ -236,7 +204,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(low - 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; low--; } @@ -248,7 +216,7 @@ static int kallsyms_lookup_names(const char *name, seq = get_symbol_seq(high + 1); off = get_symbol_offset(seq); kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - if (compare_symbol_name(name, namebuf)) + if (strcmp(name, namebuf)) break; high++; } @@ -407,8 +375,7 @@ static int kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = strlen(namebuf); - goto found; + return strlen(namebuf); } /* See if it's in a module or a BPF JITed image. 
*/ @@ -422,8 +389,6 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); -found: - cleanup_symbol_name(namebuf); return ret; } @@ -450,8 +415,6 @@ const char *kallsyms_lookup(unsigned long addr, int lookup_symbol_name(unsigned long addr, char *symname) { - int res; - symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; @@ -462,16 +425,10 @@ int lookup_symbol_name(unsigned long addr, char *symname) /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); - goto found; + return 0; } /* See if it's in a module. */ - res = lookup_module_symbol_name(addr, symname); - if (res) - return res; - -found: - cleanup_symbol_name(symname); - return 0; + return lookup_module_symbol_name(addr, symname); } /* Look up a kernel symbol and return it in a text buffer. */ diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 2f84896a7bcb..873f7c445488 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -187,31 +187,11 @@ static void test_perf_kallsyms_lookup_name(void) stat.min, stat.max, div_u64(stat.sum, stat.real_cnt)); } -static bool match_cleanup_name(const char *s, const char *name) -{ - char *p; - int len; - - if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return false; - - p = strstr(s, ".llvm."); - if (!p) - return false; - - len = strlen(name); - if (p - s != len) - return false; - - return !strncmp(s, name, len); -} - static int find_symbol(void *data, const char *name, unsigned long addr) { struct test_stat *stat = (struct test_stat *)data; - if (strcmp(name, stat->name) == 0 || - (!stat->perf && match_cleanup_name(name, stat->name))) { + if (!strcmp(name, stat->name)) { stat->real_cnt++; stat->addr = addr; diff --git a/kernel/kcov.c b/kernel/kcov.c index f0a69d402066..274b6b7c718d 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -161,6 +161,15 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, kmsan_unpoison_memory(&area->list, sizeof(area->list)); } +/* + * Unlike in_serving_softirq(), this function returns false when called during + * a hardirq or an NMI that happened in the softirq context. + */ +static inline bool in_softirq_really(void) +{ + return in_serving_softirq() && !in_hardirq() && !in_nmi(); +} + static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { unsigned int mode; @@ -170,7 +179,7 @@ static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_stru * so we ignore code executed in interrupts, unless we are in a remote * coverage collection section in a softirq. 
*/ - if (!in_task() && !(in_serving_softirq() && t->kcov_softirq)) + if (!in_task() && !(in_softirq_really() && t->kcov_softirq)) return false; mode = READ_ONCE(t->kcov_mode); /* @@ -849,7 +858,7 @@ void kcov_remote_start(u64 handle) if (WARN_ON(!kcov_check_handle(handle, true, true, true))) return; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); @@ -991,7 +1000,7 @@ void kcov_remote_stop(void) int sequence; unsigned long flags; - if (!in_task() && !in_serving_softirq()) + if (!in_task() && !in_softirq_really()) return; local_lock_irqsave(&kcov_percpu_data.lock, flags); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e85de37d9e1e..da59c68df841 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1557,8 +1557,8 @@ static bool is_cfi_preamble_symbol(unsigned long addr) if (lookup_symbol_name(addr, symbuf)) return false; - return str_has_prefix("__cfi_", symbuf) || - str_has_prefix("__pfx_", symbuf); + return str_has_prefix(symbuf, "__cfi_") || + str_has_prefix(symbuf, "__pfx_"); } static int check_kprobe_address_safe(struct kprobe *p, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 07fb5987b42b..1bab21b4718f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -92,7 +92,14 @@ static ssize_t profiling_store(struct kobject *kobj, const char *buf, size_t count) { int ret; + static DEFINE_MUTEX(lock); + /* + * We need serialization, for profile_setup() initializes prof_on + * value and profile_init() must not reallocate prof_buffer after + * once allocated. + */ + guard(mutex)(&lock); if (prof_on) return -EEXIST; /* diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 58c88220a478..0349f957e672 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5936,6 +5936,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, ip); @@ -5978,6 +5981,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; + if (unlikely(lock->key == &__lockdep_no_track__)) + return; + hlock = find_held_lock(curr, lock, depth, &i); if (!hlock) { print_lock_contention_bug(curr, lock, _RET_IP_); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f5a36e67b593..ac2e22502741 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -357,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - enum vcpu_state old = vcpu_halted; + u8 old = vcpu_halted; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). 
If OTOH this fails, the vCPU was running and will diff --git a/kernel/module/main.c b/kernel/module/main.c index d9592195c5bb..71396e297499 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3104,7 +3104,7 @@ static bool idempotent(struct idempotent *u, const void *cookie) struct idempotent *existing; bool first; - u->ret = 0; + u->ret = -EINTR; u->cookie = cookie; init_completion(&u->complete); @@ -3140,7 +3140,7 @@ static int idempotent_complete(struct idempotent *u, int ret) hlist_for_each_entry_safe(pos, next, head, entry) { if (pos->cookie != cookie) continue; - hlist_del(&pos->entry); + hlist_del_init(&pos->entry); pos->ret = ret; complete(&pos->complete); } @@ -3148,6 +3148,28 @@ static int idempotent_complete(struct idempotent *u, int ret) return ret; } +/* + * Wait for the idempotent worker. + * + * If we get interrupted, we need to remove ourselves from the + * the idempotent list, and the completion may still come in. + * + * The 'idem_lock' protects against the race, and 'idem.ret' was + * initialized to -EINTR and is thus always the right return + * value even if the idempotent work then completes between + * the wait_for_completion and the cleanup. + */ +static int idempotent_wait_for_completion(struct idempotent *u) +{ + if (wait_for_completion_interruptible(&u->complete)) { + spin_lock(&idem_lock); + if (!hlist_unhashed(&u->entry)) + hlist_del(&u->entry); + spin_unlock(&idem_lock); + } + return u->ret; +} + static int init_module_from_file(struct file *f, const char __user * uargs, int flags) { struct load_info info = { }; @@ -3183,15 +3205,16 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int if (!f || !(f->f_mode & FMODE_READ)) return -EBADF; - /* See if somebody else is doing the operation? */ - if (idempotent(&idem, file_inode(f))) { - wait_for_completion(&idem.complete); - return idem.ret; + /* Are we the winners of the race and get to do this? */ + if (!idempotent(&idem, file_inode(f))) { + int ret = init_module_from_file(f, uargs, flags); + return idempotent_complete(&idem, ret); } - /* Otherwise, we'll do it and complete others */ - return idempotent_complete(&idem, - init_module_from_file(f, uargs, flags)); + /* + * Somebody else won the race and is loading the module. + */ + return idempotent_wait_for_completion(&idem); } SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) diff --git a/kernel/padata.c b/kernel/padata.c index 53f4bc912712..0fa6c2895460 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -517,6 +517,13 @@ void __init padata_do_multithreaded(struct padata_mt_job *job) ps.chunk_size = max(ps.chunk_size, job->min_chunk); ps.chunk_size = roundup(ps.chunk_size, job->align); + /* + * chunk_size can be 0 if the caller sets min_chunk to 0. 
So force it + * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().` + */ + if (!ps.chunk_size) + ps.chunk_size = 1U; + list_for_each_entry(pw, &works, pw_list) if (job->numa_aware) { int old_node = atomic_read(&last_used_nid); diff --git a/kernel/profile.c b/kernel/profile.c index 2b775cc5c28f..1fcf1adcf4eb 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -47,30 +47,14 @@ static unsigned short int prof_shift; int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); -static cpumask_var_t prof_cpu_mask; -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); -static DEFINE_PER_CPU(int, cpu_profile_flip); -static DEFINE_MUTEX(profile_flip_mutex); -#endif /* CONFIG_SMP */ - int profile_setup(char *str) { static const char schedstr[] = "schedule"; - static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; const char *select = NULL; int par; - if (!strncmp(str, sleepstr, strlen(sleepstr))) { -#ifdef CONFIG_SCHEDSTATS - force_schedstat_enabled(); - prof_on = SLEEP_PROFILING; - select = sleepstr; -#else - pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); -#endif /* CONFIG_SCHEDSTATS */ - } else if (!strncmp(str, schedstr, strlen(schedstr))) { + if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { @@ -114,11 +98,6 @@ int __ref profile_init(void) buffer_bytes = prof_len*sizeof(atomic_t); - if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(prof_cpu_mask, cpu_possible_mask); - prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); if (prof_buffer) return 0; @@ -132,195 +111,16 @@ int __ref profile_init(void) if (prof_buffer) return 0; - free_cpumask_var(prof_cpu_mask); return -ENOMEM; } -#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) -/* - * Each cpu has a pair of open-addressed hashtables for pending - * profile hits. read_profile() IPI's all cpus to request them - * to flip buffers and flushes their contents to prof_buffer itself. - * Flip requests are serialized by the profile_flip_mutex. The sole - * use of having a second hashtable is for avoiding cacheline - * contention that would otherwise happen during flushes of pending - * profile hits required for the accuracy of reported profile hits - * and so resurrect the interrupt livelock issue. - * - * The open-addressed hashtables are indexed by profile buffer slot - * and hold the number of pending hits to that profile buffer slot on - * a cpu in an entry. When the hashtable overflows, all pending hits - * are accounted to their corresponding profile buffer slots with - * atomic_add() and the hashtable emptied. As numerous pending hits - * may be accounted to a profile buffer slot in a hashtable entry, - * this amortizes a number of atomic profile buffer increments likely - * to be far larger than the number of entries in the hashtable, - * particularly given that the number of distinct profile buffer - * positions to which hits are accounted during short intervals (e.g. - * several seconds) is usually very small. Exclusion from buffer - * flipping is provided by interrupt disablement (note that for - * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from - * process context). 
- * The hash function is meant to be lightweight as opposed to strong, - * and was vaguely inspired by ppc64 firmware-supported inverted - * pagetable hash functions, but uses a full hashtable full of finite - * collision chains, not just pairs of them. - * - * -- nyc - */ -static void __profile_flip_buffers(void *unused) -{ - int cpu = smp_processor_id(); - - per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); -} - -static void profile_flip_buffers(void) -{ - int i, j, cpu; - - mutex_lock(&profile_flip_mutex); - j = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; - for (i = 0; i < NR_PROFILE_HIT; ++i) { - if (!hits[i].hits) { - if (hits[i].pc) - hits[i].pc = 0; - continue; - } - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].hits = hits[i].pc = 0; - } - } - mutex_unlock(&profile_flip_mutex); -} - -static void profile_discard_flip_buffers(void) -{ - int i, cpu; - - mutex_lock(&profile_flip_mutex); - i = per_cpu(cpu_profile_flip, get_cpu()); - put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 1); - for_each_online_cpu(cpu) { - struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; - memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); - } - mutex_unlock(&profile_flip_mutex); -} - -static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) -{ - unsigned long primary, secondary, flags, pc = (unsigned long)__pc; - int i, j, cpu; - struct profile_hit *hits; - - pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); - i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; - cpu = get_cpu(); - hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; - if (!hits) { - put_cpu(); - return; - } - /* - * We buffer the global profiler buffer into a per-CPU - * queue and thus reduce the number of global (and possibly - * NUMA-alien) accesses. 
The write-queue is self-coalescing: - */ - local_irq_save(flags); - do { - for (j = 0; j < PROFILE_GRPSZ; ++j) { - if (hits[i + j].pc == pc) { - hits[i + j].hits += nr_hits; - goto out; - } else if (!hits[i + j].hits) { - hits[i + j].pc = pc; - hits[i + j].hits = nr_hits; - goto out; - } - } - i = (i + secondary) & (NR_PROFILE_HIT - 1); - } while (i != primary); - - /* - * Add the current hit(s) and flush the write-queue out - * to the global buffer: - */ - atomic_add(nr_hits, &prof_buffer[pc]); - for (i = 0; i < NR_PROFILE_HIT; ++i) { - atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); - hits[i].pc = hits[i].hits = 0; - } -out: - local_irq_restore(flags); - put_cpu(); -} - -static int profile_dead_cpu(unsigned int cpu) -{ - struct page *page; - int i; - - if (cpumask_available(prof_cpu_mask)) - cpumask_clear_cpu(cpu, prof_cpu_mask); - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) { - page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[i]); - per_cpu(cpu_profile_hits, cpu)[i] = NULL; - __free_page(page); - } - } - return 0; -} - -static int profile_prepare_cpu(unsigned int cpu) -{ - int i, node = cpu_to_mem(cpu); - struct page *page; - - per_cpu(cpu_profile_flip, cpu) = 0; - - for (i = 0; i < 2; i++) { - if (per_cpu(cpu_profile_hits, cpu)[i]) - continue; - - page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) { - profile_dead_cpu(cpu); - return -ENOMEM; - } - per_cpu(cpu_profile_hits, cpu)[i] = page_address(page); - - } - return 0; -} - -static int profile_online_cpu(unsigned int cpu) -{ - if (cpumask_available(prof_cpu_mask)) - cpumask_set_cpu(cpu, prof_cpu_mask); - - return 0; -} - -#else /* !CONFIG_SMP */ -#define profile_flip_buffers() do { } while (0) -#define profile_discard_flip_buffers() do { } while (0) - static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; - atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); + if (pc < prof_len) + atomic_add(nr_hits, &prof_buffer[pc]); } -#endif /* !CONFIG_SMP */ void profile_hits(int type, void *__pc, unsigned int nr_hits) { @@ -334,8 +134,8 @@ void profile_tick(int type) { struct pt_regs *regs = get_irq_regs(); - if (!user_mode(regs) && cpumask_available(prof_cpu_mask) && - cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) + /* This is the old kernel-only legacy profiling */ + if (!user_mode(regs)) profile_hit(type, (void *)profile_pc(regs)); } @@ -358,7 +158,6 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) char *pnt; unsigned long sample_step = 1UL << prof_shift; - profile_flip_buffers(); if (p >= (prof_len+1)*sizeof(unsigned int)) return 0; if (count > (prof_len+1)*sizeof(unsigned int) - p) @@ -404,7 +203,6 @@ static ssize_t write_profile(struct file *file, const char __user *buf, return -EINVAL; } #endif - profile_discard_flip_buffers(); memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); return count; } @@ -418,40 +216,14 @@ static const struct proc_ops profile_proc_ops = { int __ref create_proc_profile(void) { struct proc_dir_entry *entry; -#ifdef CONFIG_SMP - enum cpuhp_state online_state; -#endif - int err = 0; if (!prof_on) return 0; -#ifdef CONFIG_SMP - err = cpuhp_setup_state(CPUHP_PROFILE_PREPARE, "PROFILE_PREPARE", - profile_prepare_cpu, profile_dead_cpu); - if (err) - return err; - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "AP_PROFILE_ONLINE", - profile_online_cpu, NULL); - if (err < 0) - goto err_state_prep; - online_state = err; - 
err = 0; -#endif entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &profile_proc_ops); - if (!entry) - goto err_state_onl; - proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - - return err; -err_state_onl: -#ifdef CONFIG_SMP - cpuhp_remove_state(online_state); -err_state_prep: - cpuhp_remove_state(CPUHP_PROFILE_PREPARE); -#endif + if (entry) + proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); return err; } subsys_initcall(create_proc_profile); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a9f655025607..f3951e4a55e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7845,6 +7845,30 @@ void set_rq_offline(struct rq *rq) } } +static inline void sched_set_rq_online(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + +static inline void sched_set_rq_offline(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + /* * used to mark begin/end of suspend/resume: */ @@ -7895,10 +7919,25 @@ static int cpuset_cpu_inactive(unsigned int cpu) return 0; } +static inline void sched_smt_present_inc(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); +#endif +} + +static inline void sched_smt_present_dec(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif +} + int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; /* * Clear the balance_push callback and prepare to schedule @@ -7906,13 +7945,10 @@ int sched_cpu_activate(unsigned int cpu) */ balance_push_set(cpu, false); -#ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_inc_cpuslocked(&sched_smt_present); -#endif + sched_smt_present_inc(cpu); set_cpu_active(cpu, true); if (sched_smp_initialized) { @@ -7930,12 +7966,7 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_online(rq, cpu); return 0; } @@ -7943,7 +7974,6 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; int ret; /* @@ -7974,20 +8004,14 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_offline(rq, cpu); -#ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. 
*/ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_dec_cpuslocked(&sched_smt_present); + sched_smt_present_dec(cpu); +#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); #endif @@ -7997,6 +8021,8 @@ int sched_cpu_deactivate(unsigned int cpu) sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { + sched_smt_present_inc(cpu); + sched_set_rq_online(rq, cpu); balance_push_set(cpu, false); set_cpu_active(cpu, true); sched_update_numa(cpu, true); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a5e00293ae43..0bed0fa1acd9 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -582,6 +582,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, } stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); + /* + * Because mul_u64_u64_div_u64() can approximate on some + * achitectures; enforce the constraint that: a*b/(b+c) <= a. + */ + if (unlikely(stime > rtime)) + stime = rtime; update: /* diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 78e48f5426ee..eb0cdcd4d921 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -92,16 +92,6 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, trace_sched_stat_blocked(p, delta); - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(p), - delta >> 20); - } account_scheduler_latency(p, delta >> 10, 0); } } diff --git a/kernel/task_work.c b/kernel/task_work.c index 5c2daa7ad3f9..5d14d639ac71 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -6,12 +6,14 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +#ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); +#endif /** * task_work_add - ask the @task to execute @work->func() @@ -57,6 +59,8 @@ int task_work_add(struct task_struct *task, struct callback_head *work, if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; + if (!IS_ENABLED(CONFIG_IRQ_WORK)) + return -EINVAL; } else { /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); @@ -81,9 +85,11 @@ int task_work_add(struct task_struct *task, struct callback_head *work, case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; +#ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; +#endif default: WARN_ON_ONCE(1); break; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d25ba49e313c..d0538a75f4c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -246,7 +246,7 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); if (wd_delay <= WATCHDOG_MAX_SKEW) { - if (nretries > 1 || nretries >= max_retries) { + if (nretries > 1 && nretries >= max_retries) { pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 406dccb79c2b..8d2dd214ec68 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -727,17 +727,16 @@ 
static inline void process_adjtimex_modes(const struct __kernel_timex *txc, } if (txc->modes & ADJ_MAXERROR) - time_maxerror = txc->maxerror; + time_maxerror = clamp(txc->maxerror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_ESTERROR) - time_esterror = txc->esterror; + time_esterror = clamp(txc->esterror, 0, NTP_PHASE_LIMIT); if (txc->modes & ADJ_TIMECONST) { - time_constant = txc->constant; + time_constant = clamp(txc->constant, 0, MAXTC); if (!(time_status & STA_NANO)) time_constant += 4; - time_constant = min(time_constant, (long)MAXTC); - time_constant = max(time_constant, 0l); + time_constant = clamp(time_constant, 0, MAXTC); } if (txc->modes & ADJ_TAI && diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b4843099a8da..ed58eebb4e8f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1141,7 +1141,6 @@ void tick_broadcast_switch_to_oneshot(void) #ifdef CONFIG_HOTPLUG_CPU void hotplug_cpu__broadcast_tick_pull(int deadcpu) { - struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *bc; unsigned long flags; @@ -1167,6 +1166,8 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) * device to avoid the starvation. */ if (tick_check_broadcast_expired()) { + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask); tick_program_event(td->evtdev->next_event, 1); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2fa87dcfeda9..5391e4167d60 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2606,7 +2606,7 @@ int do_adjtimex(struct __kernel_timex *txc) clock_set |= timekeeping_advance(TK_ADV_FREQ); if (clock_set) - clock_was_set(CLOCK_REALTIME); + clock_was_set(CLOCK_SET_WALL); ntp_notify_cmos_timer(); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 84413114db5c..8d57f7686bb0 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -475,9 +475,54 @@ static bool tmigr_check_lonely(struct tmigr_group *group) return bitmap_weight(&active, BIT_CNT) <= 1; } -typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, void *); +/** + * struct tmigr_walk - data required for walking the hierarchy + * @nextexp: Next CPU event expiry information which is handed into + * the timer migration code by the timer code + * (get_next_timer_interrupt()) + * @firstexp: Contains the first event expiry information when + * hierarchy is completely idle. When CPU itself was the + * last going idle, information makes sure, that CPU will + * be back in time. When using this value in the remote + * expiry case, firstexp is stored in the per CPU tmigr_cpu + * struct of CPU which expires remote timers. It is updated + * in top level group only. Be aware, there could occur a + * new top level of the hierarchy between the 'top level + * call' in tmigr_update_events() and the check for the + * parent group in walk_groups(). Then @firstexp might + * contain a value != KTIME_MAX even if it was not the + * final top level. This is not a problem, as the worst + * outcome is a CPU which might wake up a little early. 
+ * @evt: Pointer to tmigr_event which needs to be queued (of idle + * child group) + * @childmask: groupmask of child group + * @remote: Is set, when the new timer path is executed in + * tmigr_handle_remote_cpu() + * @basej: timer base in jiffies + * @now: timer base monotonic + * @check: is set if there is the need to handle remote timers; + * required in tmigr_requires_handle_remote() only + * @tmc_active: this flag indicates, whether the CPU which triggers + * the hierarchy walk is !idle in the timer migration + * hierarchy. When the CPU is idle and the whole hierarchy is + * idle, only the first event of the top level has to be + * considered. + */ +struct tmigr_walk { + u64 nextexp; + u64 firstexp; + struct tmigr_event *evt; + u8 childmask; + bool remote; + unsigned long basej; + u64 now; + bool check; + bool tmc_active; +}; + +typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, void *data, +static void __walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { struct tmigr_group *child = NULL, *group = tmc->tmgroup; @@ -490,64 +535,17 @@ static void __walk_groups(up_f up, void *data, child = group; group = group->parent; + data->childmask = child->groupmask; } while (group); } -static void walk_groups(up_f up, void *data, struct tmigr_cpu *tmc) +static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); __walk_groups(up, data, tmc); } -/** - * struct tmigr_walk - data required for walking the hierarchy - * @nextexp: Next CPU event expiry information which is handed into - * the timer migration code by the timer code - * (get_next_timer_interrupt()) - * @firstexp: Contains the first event expiry information when last - * active CPU of hierarchy is on the way to idle to make - * sure CPU will be back in time. - * @evt: Pointer to tmigr_event which needs to be queued (of idle - * child group) - * @childmask: childmask of child group - * @remote: Is set, when the new timer path is executed in - * tmigr_handle_remote_cpu() - */ -struct tmigr_walk { - u64 nextexp; - u64 firstexp; - struct tmigr_event *evt; - u8 childmask; - bool remote; -}; - -/** - * struct tmigr_remote_data - data required for remote expiry hierarchy walk - * @basej: timer base in jiffies - * @now: timer base monotonic - * @firstexp: returns expiry of the first timer in the idle timer - * migration hierarchy to make sure the timer is handled in - * time; it is stored in the per CPU tmigr_cpu struct of - * CPU which expires remote timers - * @childmask: childmask of child group - * @check: is set if there is the need to handle remote timers; - * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. 
- */ -struct tmigr_remote_data { - unsigned long basej; - u64 now; - u64 firstexp; - u8 childmask; - bool check; - bool tmc_active; -}; - /* * Returns the next event of the timerqueue @group->events * @@ -618,10 +616,9 @@ static u64 tmigr_next_groupevt_expires(struct tmigr_group *group) static bool tmigr_active_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; @@ -649,8 +646,7 @@ static bool tmigr_active_up(struct tmigr_group *group, } while (!atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)); - if ((walk_done == false) && group->parent) - data->childmask = group->childmask; + trace_tmigr_group_set_cpu_active(group, newstate, childmask); /* * The group is active (again). The group event might be still queued @@ -666,8 +662,6 @@ static bool tmigr_active_up(struct tmigr_group *group, */ group->groupevt.ignore = true; - trace_tmigr_group_set_cpu_active(group, newstate, childmask); - return walk_done; } @@ -675,7 +669,7 @@ static void __tmigr_cpu_activate(struct tmigr_cpu *tmc) { struct tmigr_walk data; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; trace_tmigr_cpu_active(tmc); @@ -860,10 +854,8 @@ unlock: static bool tmigr_new_timer_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_walk *data = ptr; - return tmigr_update_events(group, child, data); } @@ -995,9 +987,8 @@ unlock: static bool tmigr_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; struct tmigr_event *evt; unsigned long jif; u8 childmask; @@ -1034,12 +1025,10 @@ again: } /* - * Update of childmask for the next level and keep track of the expiry - * of the first event that needs to be handled (group->next_expiry was - * updated by tmigr_next_expired_groupevt(), next was set by - * tmigr_handle_remote_cpu()). + * Keep track of the expiry of the first event that needs to be handled + * (group->next_expiry was updated by tmigr_next_expired_groupevt(), + * next was set by tmigr_handle_remote_cpu()). */ - data->childmask = group->childmask; data->firstexp = group->next_expiry; raw_spin_unlock_irq(&group->lock); @@ -1055,12 +1044,12 @@ again: void tmigr_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; if (tmigr_is_not_available(tmc)) return; - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; /* @@ -1068,7 +1057,7 @@ void tmigr_handle_remote(void) * in tmigr_handle_remote_up() anyway. Keep this check to speed up the * return when nothing has to be done. */ - if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { + if (!tmigr_check_migrator(tmc->tmgroup, tmc->groupmask)) { /* * If this CPU was an idle migrator, make sure to clear its wakeup * value so it won't chase timers that have already expired elsewhere. @@ -1097,9 +1086,8 @@ void tmigr_handle_remote(void) static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { - struct tmigr_remote_data *data = ptr; u8 childmask; childmask = data->childmask; @@ -1118,7 +1106,7 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, * group before reading the next_expiry value. 
*/ if (group->parent && !data->tmc_active) - goto out; + return false; /* * The lock is required on 32bit architectures to read the variable @@ -1143,9 +1131,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, raw_spin_unlock(&group->lock); } -out: - /* Update of childmask for the next level */ - data->childmask = group->childmask; return false; } @@ -1157,7 +1142,7 @@ out: bool tmigr_requires_handle_remote(void) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - struct tmigr_remote_data data; + struct tmigr_walk data; unsigned long jif; bool ret = false; @@ -1165,7 +1150,7 @@ bool tmigr_requires_handle_remote(void) return ret; data.now = get_jiffies_update(&jif); - data.childmask = tmc->childmask; + data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; data.tmc_active = !tmc->idle; data.check = false; @@ -1230,14 +1215,13 @@ u64 tmigr_cpu_new_timer(u64 nextexp) if (nextexp != tmc->cpuevt.nextevt.expires || tmc->cpuevt.ignore) { ret = tmigr_new_timer(tmc, nextexp); + /* + * Make sure the reevaluation of timers in idle path + * will not miss an event. + */ + WRITE_ONCE(tmc->wakeup, ret); } } - /* - * Make sure the reevaluation of timers in idle path will not miss an - * event. - */ - WRITE_ONCE(tmc->wakeup, ret); - trace_tmigr_cpu_new_timer_idle(tmc, nextexp); raw_spin_unlock(&tmc->lock); return ret; @@ -1245,10 +1229,9 @@ u64 tmigr_cpu_new_timer(u64 nextexp) static bool tmigr_inactive_up(struct tmigr_group *group, struct tmigr_group *child, - void *ptr) + struct tmigr_walk *data) { union tmigr_state curstate, newstate, childstate; - struct tmigr_walk *data = ptr; bool walk_done; u8 childmask; @@ -1299,9 +1282,10 @@ static bool tmigr_inactive_up(struct tmigr_group *group, WARN_ON_ONCE((newstate.migrator != TMIGR_NONE) && !(newstate.active)); - if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, - newstate.state)) + if (atomic_try_cmpxchg(&group->migr_state, &curstate.state, newstate.state)) { + trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); break; + } /* * The memory barrier is paired with the cmpxchg() in @@ -1317,22 +1301,6 @@ static bool tmigr_inactive_up(struct tmigr_group *group, /* Event Handling */ tmigr_update_events(group, child, data); - if (group->parent && (walk_done == false)) - data->childmask = group->childmask; - - /* - * data->firstexp was set by tmigr_update_events() and contains the - * expiry of the first global event which needs to be handled. 
It - * differs from KTIME_MAX if: - * - group is the top level group and - * - group is idle (which means CPU was the last active CPU in the - * hierarchy) and - * - there is a pending event in the hierarchy - */ - WARN_ON_ONCE(data->firstexp != KTIME_MAX && group->parent); - - trace_tmigr_group_set_cpu_inactive(group, newstate, childmask); - return walk_done; } @@ -1341,7 +1309,7 @@ static u64 __tmigr_cpu_deactivate(struct tmigr_cpu *tmc, u64 nextexp) struct tmigr_walk data = { .nextexp = nextexp, .firstexp = KTIME_MAX, .evt = &tmc->cpuevt, - .childmask = tmc->childmask }; + .childmask = tmc->groupmask }; /* * If nextexp is KTIME_MAX, the CPU event will be ignored because the @@ -1400,7 +1368,7 @@ u64 tmigr_cpu_deactivate(u64 nextexp) * the only one in the level 0 group; and if it is the * only one in level 0 group, but there are more than a * single group active on the way to top level) - * * nextevt - when CPU is offline and has to handle timer on his own + * * nextevt - when CPU is offline and has to handle timer on its own * or when on the way to top in every group only a single * child is active but @nextevt is before the lowest * next_expiry encountered while walking up to top level. @@ -1419,7 +1387,7 @@ u64 tmigr_quick_check(u64 nextevt) if (WARN_ON_ONCE(tmc->idle)) return nextevt; - if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->childmask)) + if (!tmigr_check_migrator_and_lonely(tmc->tmgroup, tmc->groupmask)) return KTIME_MAX; do { @@ -1442,6 +1410,66 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; } +/* + * tmigr_trigger_active() - trigger a CPU to become active again + * + * This function is executed on a CPU which is part of cpu_online_mask, when the + * last active CPU in the hierarchy is offlining. With this, it is ensured that + * the other CPU is active and takes over the migrator duty. 
+ */ +static long tmigr_trigger_active(void *unused) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + WARN_ON_ONCE(!tmc->online || tmc->idle); + + return 0; +} + +static int tmigr_cpu_offline(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + int migrator; + u64 firstexp; + + raw_spin_lock_irq(&tmc->lock); + tmc->online = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_offline(tmc); + raw_spin_unlock_irq(&tmc->lock); + + if (firstexp != KTIME_MAX) { + migrator = cpumask_any_but(cpu_online_mask, cpu); + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } + + return 0; +} + +static int tmigr_cpu_online(unsigned int cpu) +{ + struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + + /* Check whether CPU data was successfully initialized */ + if (WARN_ON_ONCE(!tmc->tmgroup)) + return -EINVAL; + + raw_spin_lock_irq(&tmc->lock); + trace_tmigr_cpu_online(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->online = true; + raw_spin_unlock_irq(&tmc->lock); + return 0; +} + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1514,21 +1542,25 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, } static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent) + struct tmigr_group *parent, + bool activate) { - union tmigr_state childstate; + struct tmigr_walk data; raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); child->parent = parent; - child->childmask = BIT(parent->num_children++); + child->groupmask = BIT(parent->num_children++); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); trace_tmigr_connect_child_parent(child); + if (!activate) + return; + /* * To prevent inconsistent states, active children need to be active in * the new parent as well. Inactive children are already marked inactive @@ -1544,21 +1576,24 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, * child to the new parent. So tmigr_connect_child_parent() is * executed with the formerly top level group (child) and the newly * created group (parent). + * + * * It is ensured that the child is active, as this setup path is + * executed in hotplug prepare callback. This is exectued by an + * already connected and !idle CPU. Even if all other CPUs go idle, + * the CPU executing the setup will be responsible up to current top + * level group. And the next time it goes inactive, it will release + * the new childmask and parent to subsequent walkers through this + * @child. Therefore propagate active state unconditionally. */ - childstate.state = atomic_read(&child->migr_state); - if (childstate.migrator != TMIGR_NONE) { - struct tmigr_walk data; - - data.childmask = child->childmask; + data.childmask = child->groupmask; - /* - * There is only one new level per time. When connecting the - * child and the parent and set the child active when the parent - * is inactive, the parent needs to be the uppermost - * level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); - } + /* + * There is only one new level per time (which is protected by + * tmigr_mutex). 
When connecting the child and the parent and set the + * child active when the parent is inactive, the parent needs to be the + * uppermost level. Otherwise there went something wrong! + */ + WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } static int tmigr_setup_groups(unsigned int cpu, unsigned int node) @@ -1611,12 +1646,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * Update tmc -> group / child -> group connection */ if (i == 0) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); raw_spin_lock_irq(&group->lock); tmc->tmgroup = group; - tmc->childmask = BIT(group->num_children++); + tmc->groupmask = BIT(group->num_children++); raw_spin_unlock_irq(&group->lock); @@ -1626,7 +1661,8 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - tmigr_connect_child_parent(child, group); + /* Will be activated at online time */ + tmigr_connect_child_parent(child, group, false); } /* check if uppermost level was newly created */ @@ -1637,12 +1673,21 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) lvllist = &tmigr_level_list[top]; if (group->num_children == 1 && list_is_singular(lvllist)) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. + */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + lvllist = &tmigr_level_list[top - 1]; list_for_each_entry(child, lvllist, list) { if (child->parent) continue; - tmigr_connect_child_parent(child, group); + tmigr_connect_child_parent(child, group, true); } } } @@ -1664,80 +1709,31 @@ static int tmigr_add_cpu(unsigned int cpu) return ret; } -static int tmigr_cpu_online(unsigned int cpu) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int ret; - - /* First online attempt? Initialize CPU data */ - if (!tmc->tmgroup) { - raw_spin_lock_init(&tmc->lock); - - ret = tmigr_add_cpu(cpu); - if (ret < 0) - return ret; - - if (tmc->childmask == 0) - return -EINVAL; - - timerqueue_init(&tmc->cpuevt.nextevt); - tmc->cpuevt.nextevt.expires = KTIME_MAX; - tmc->cpuevt.ignore = true; - tmc->cpuevt.cpu = cpu; - - tmc->remote = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); - } - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); - tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); - return 0; -} - -/* - * tmigr_trigger_active() - trigger a CPU to become active again - * - * This function is executed on a CPU which is part of cpu_online_mask, when the - * last active CPU in the hierarchy is offlining. With this, it is ensured that - * the other CPU is active and takes over the migrator duty. - */ -static long tmigr_trigger_active(void *unused) +static int tmigr_cpu_prepare(unsigned int cpu) { - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); + struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); + int ret = 0; - WARN_ON_ONCE(!tmc->online || tmc->idle); - - return 0; -} - -static int tmigr_cpu_offline(unsigned int cpu) -{ - struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - int migrator; - u64 firstexp; + /* Not first online attempt? 
 	 */
+	if (tmc->tmgroup)
+		return ret;
 
-	raw_spin_lock_irq(&tmc->lock);
-	tmc->online = false;
+	raw_spin_lock_init(&tmc->lock);
+	timerqueue_init(&tmc->cpuevt.nextevt);
+	tmc->cpuevt.nextevt.expires = KTIME_MAX;
+	tmc->cpuevt.ignore = true;
+	tmc->cpuevt.cpu = cpu;
+	tmc->remote = false;
 	WRITE_ONCE(tmc->wakeup, KTIME_MAX);
 
-	/*
-	 * CPU has to handle the local events on his own, when on the way to
-	 * offline; Therefore nextevt value is set to KTIME_MAX
-	 */
-	firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
-	trace_tmigr_cpu_offline(tmc);
-	raw_spin_unlock_irq(&tmc->lock);
+	ret = tmigr_add_cpu(cpu);
+	if (ret < 0)
+		return ret;
 
-	if (firstexp != KTIME_MAX) {
-		migrator = cpumask_any_but(cpu_online_mask, cpu);
-		work_on_cpu(migrator, tmigr_trigger_active, NULL);
-	}
+	if (tmc->groupmask == 0)
+		return -EINVAL;
 
-	return 0;
+	return ret;
 }
 
 static int __init tmigr_init(void)
@@ -1796,6 +1792,11 @@ static int __init tmigr_init(void)
 		tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP,
 		tmigr_crossnode_level);
 
+	ret = cpuhp_setup_state(CPUHP_TMIGR_PREPARE, "tmigr:prepare",
+				tmigr_cpu_prepare, NULL);
+	if (ret)
+		goto err;
+
 	ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
 				tmigr_cpu_online, tmigr_cpu_offline);
 	if (ret)
@@ -1807,4 +1808,4 @@ err:
 	pr_err("Timer migration setup failed\n");
 	return ret;
 }
-late_initcall(tmigr_init);
+early_initcall(tmigr_init);
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 6c37d94a37d9..154accc7a543 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -22,7 +22,17 @@ struct tmigr_event {
  * struct tmigr_group - timer migration hierarchy group
  * @lock:		Lock protecting the event information and group hierarchy
  *			information during setup
- * @parent:		Pointer to the parent group
+ * @parent:		Pointer to the parent group. Pointer is updated when a
+ *			new hierarchy level is added because of a CPU coming
+ *			online the first time. Once it is set, the pointer will
+ *			not be removed or updated. When accessing parent pointer
+ *			lock less to decide whether to abort a propagation or
+ *			not, it is not a problem. The worst outcome is an
+ *			unnecessary/early CPU wake up. But do not access parent
+ *			pointer several times in the same 'action' (like
+ *			activation, deactivation, check for remote expiry,...)
+ *			without holding the lock as it is not ensured that value
+ *			will not change.
  * @groupevt:		Next event of the group which is only used when the
  *			group is !active. The group event is then queued into
  *			the parent timer queue.
@@ -41,9 +51,8 @@ struct tmigr_event {
  * @num_children:	Counter of group children to make sure the group is only
  *			filled with TMIGR_CHILDREN_PER_GROUP; Required for setup
  *			only
- * @childmask:		childmask of the group in the parent group; is set
- *			during setup and will never change; can be read
- *			lockless
+ * @groupmask:		mask of the group in the parent group; is set during
+ *			setup and will never change; can be read lockless
  * @list:		List head that is added to the per level
  *			tmigr_level_list; is required during setup when a
  *			new group needs to be connected to the existing
@@ -59,7 +68,7 @@ struct tmigr_group {
 	unsigned int		level;
 	int			numa_node;
 	unsigned int		num_children;
-	u8			childmask;
+	u8			groupmask;
 	struct list_head	list;
 };
 
@@ -79,7 +88,7 @@ struct tmigr_group {
  *			hierarchy
  * @remote:		Is set when timers of the CPU are expired remotely
  * @tmgroup:		Pointer to the parent group
- * @childmask:		childmask of tmigr_cpu in the parent group
+ * @groupmask:		mask of tmigr_cpu in the parent group
  * @wakeup:		Stores the first timer when the timer migration
 *			hierarchy is completely idle and remote expiry was done;
 *			is returned to timer code in the idle path and is only
@@ -92,7 +101,7 @@ struct tmigr_cpu {
 	bool			idle;
 	bool			remote;
 	struct tmigr_group	*tmgroup;
-	u8			childmask;
+	u8			groupmask;
 	u64			wakeup;
 	struct tmigr_event	cpuevt;
 };
@@ -108,8 +117,8 @@ union tmigr_state {
 	u32 state;
 	/**
 	 * struct - split state of tmigr_group
-	 * @active:	Contains each childmask bit of the active children
-	 * @migrator:	Contains childmask of the child which is migrator
+	 * @active:	Contains each mask bit of the active children
+	 * @migrator:	Contains mask of the child which is migrator
 	 * @seq:	Sequence counter needs to be increased when an update
 	 *		to the tmigr_state is done. It prevents a race when
 	 *		updates in the child groups are propagated in changed
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index fc205ad167a9..d1d5ea2d0a1b 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -902,7 +902,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
 
 	i = *idx ? : task->curr_ret_stack;
 	while (i > 0) {
-		ret_stack = get_ret_stack(current, i, &i);
+		ret_stack = get_ret_stack(task, i, &i);
 		if (!ret_stack)
 			break;
 		/*
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index cb0871fbdb07..314ffc143039 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -34,8 +34,6 @@ MODULE_PARM_DESC(cpu_affinity, "Cpu num test is running on");
 
 static struct completion done;
 
-#define MIN(x, y) ((x) < (y) ? (x) : (y))
-
 static void busy_wait(ulong time)
 {
 	u64 start, end;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..cebd879a30cb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -693,18 +693,6 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
 }
 
 /**
- * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
- * @buffer: The ring_buffer to get the number of pages from
- * @cpu: The cpu of the ring_buffer to get the number of pages from
- *
- * Returns the number of pages used by a per_cpu buffer of the ring buffer.
- */
-size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
-{
-	return buffer->buffers[cpu]->nr_pages;
-}
-
-/**
  * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
  * @buffer: The ring_buffer to get the number of pages from
  * @cpu: The cpu of the ring_buffer to get the number of pages from
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 10cd38bce2f1..ebe7ce2f5f4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7956,7 +7956,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	trace_access_unlock(iter->cpu_file);
 
 	if (ret < 0) {
-		if (trace_empty(iter)) {
+		if (trace_empty(iter) && !iter->closed) {
 			if ((filp->f_flags & O_NONBLOCK))
 				return -EAGAIN;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8783bebd0562..bd3e3069300e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1634,6 +1634,29 @@ static inline void *event_file_data(struct file *filp)
 extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
+/*
+ * When the trace_event_file is the filp->i_private pointer,
+ * it must be taken under the event_mutex lock, and then checked
+ * if the EVENT_FILE_FL_FREED flag is set. If it is, then the
+ * data pointed to by the trace_event_file can not be trusted.
+ *
+ * Use the event_file_file() to access the trace_event_file from
+ * the filp the first time under the event_mutex and check for
+ * NULL. If it is needed to be retrieved again and the event_mutex
+ * is still held, then the event_file_data() can be used and it
+ * is guaranteed to be valid.
+ */
+static inline struct trace_event_file *event_file_file(struct file *filp)
+{
+	struct trace_event_file *file;
+
+	lockdep_assert_held(&event_mutex);
+	file = READ_ONCE(file_inode(filp)->i_private);
+	if (!file || file->flags & EVENT_FILE_FL_FREED)
+		return NULL;
+	return file;
+}
+
 extern const struct file_operations event_trigger_fops;
 extern const struct file_operations event_hist_fops;
 extern const struct file_operations event_hist_debug_fops;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6ef29eba90ce..7266ec2a4eea 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -992,18 +992,18 @@ static void remove_subsystem(struct trace_subsystem_dir *dir)
 
 void event_file_get(struct trace_event_file *file)
 {
-	atomic_inc(&file->ref);
+	refcount_inc(&file->ref);
 }
 
 void event_file_put(struct trace_event_file *file)
 {
-	if (WARN_ON_ONCE(!atomic_read(&file->ref))) {
+	if (WARN_ON_ONCE(!refcount_read(&file->ref))) {
 		if (file->flags & EVENT_FILE_FL_FREED)
 			kmem_cache_free(file_cachep, file);
 		return;
 	}
 
-	if (atomic_dec_and_test(&file->ref)) {
+	if (refcount_dec_and_test(&file->ref)) {
 		/* Count should only go to zero when it is freed */
 		if (WARN_ON_ONCE(!(file->flags & EVENT_FILE_FL_FREED)))
 			return;
@@ -1386,12 +1386,12 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	char buf[4] = "0";
 
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
+	file = event_file_file(filp);
 	if (likely(file))
 		flags = file->flags;
 	mutex_unlock(&event_mutex);
 
-	if (!file || flags & EVENT_FILE_FL_FREED)
+	if (!file)
 		return -ENODEV;
 
 	if (flags & EVENT_FILE_FL_ENABLED &&
@@ -1424,8 +1424,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	case 1:
 		ret = -ENODEV;
 		mutex_lock(&event_mutex);
-		file = event_file_data(filp);
-		if (likely(file && !(file->flags & EVENT_FILE_FL_FREED))) {
+		file = event_file_file(filp);
+		if (likely(file)) {
 			ret = tracing_update_buffers(file->tr);
 			if (ret < 0) {
 				mutex_unlock(&event_mutex);
@@ -1540,7 +1540,8 @@ enum {
 
 static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct trace_event_call *call = event_file_data(m->private);
+	struct trace_event_file *file = event_file_data(m->private);
+	struct trace_event_call *call = file->event_call;
 	struct list_head *common_head = &ftrace_common_fields;
 	struct list_head *head = trace_get_fields(call);
 	struct list_head *node = v;
@@ -1572,7 +1573,8 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos)
 
 static int f_show(struct seq_file *m, void *v)
 {
-	struct trace_event_call *call = event_file_data(m->private);
+	struct trace_event_file *file = event_file_data(m->private);
+	struct trace_event_call *call = file->event_call;
 	struct ftrace_event_field *field;
 	const char *array_descriptor;
 
@@ -1627,12 +1629,14 @@ static int f_show(struct seq_file *m, void *v)
 
 static void *f_start(struct seq_file *m, loff_t *pos)
 {
+	struct trace_event_file *file;
 	void *p = (void *)FORMAT_HEADER;
 	loff_t l = 0;
 
 	/* ->stop() is called even if ->start() fails */
 	mutex_lock(&event_mutex);
-	if (!event_file_data(m->private))
+	file = event_file_file(m->private);
+	if (!file)
 		return ERR_PTR(-ENODEV);
 
 	while (l < *pos && p)
@@ -1706,8 +1710,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 	trace_seq_init(s);
 
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
-	if (file && !(file->flags & EVENT_FILE_FL_FREED))
+	file = event_file_file(filp);
+	if (file)
 		print_event_filter(file, s);
 	mutex_unlock(&event_mutex);
 
@@ -1736,9 +1740,13 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return PTR_ERR(buf);
 
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
-	if (file)
-		err = apply_event_filter(file, buf);
+	file = event_file_file(filp);
+	if (file) {
+		if (file->flags & EVENT_FILE_FL_FREED)
+			err = -ENODEV;
+		else
+			err = apply_event_filter(file, buf);
+	}
 	mutex_unlock(&event_mutex);
 	kfree(buf);
 
@@ -2485,7 +2493,6 @@ static int event_callback(const char *name, umode_t *mode, void **data,
 	if (strcmp(name, "format") == 0) {
 		*mode = TRACE_MODE_READ;
 		*fops = &ftrace_event_format_fops;
-		*data = call;
 		return 1;
 	}
 
@@ -2996,7 +3003,7 @@ trace_create_new_event(struct trace_event_call *call,
 	atomic_set(&file->tm_ref, 0);
 	INIT_LIST_HEAD(&file->triggers);
 	list_add(&file->list, &tr->events);
-	event_file_get(file);
+	refcount_set(&file->ref, 1);
 
 	return file;
 }
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 6ece1308d36a..5f9119eb7c67 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5601,7 +5601,7 @@ static int hist_show(struct seq_file *m, void *v)
 
 	mutex_lock(&event_mutex);
 
-	event_file = event_file_data(m->private);
+	event_file = event_file_file(m->private);
 	if (unlikely(!event_file)) {
 		ret = -ENODEV;
 		goto out_unlock;
@@ -5880,7 +5880,7 @@ static int hist_debug_show(struct seq_file *m, void *v)
 
 	mutex_lock(&event_mutex);
 
-	event_file = event_file_data(m->private);
+	event_file = event_file_file(m->private);
 	if (unlikely(!event_file)) {
 		ret = -ENODEV;
 		goto out_unlock;
diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c
index 8650562bdaa9..a8f076809db4 100644
--- a/kernel/trace/trace_events_inject.c
+++ b/kernel/trace/trace_events_inject.c
@@ -299,7 +299,7 @@ event_inject_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	strim(buf);
 
 	mutex_lock(&event_mutex);
-	file = event_file_data(filp);
+	file = event_file_file(filp);
 	if (file) {
 		call = file->event_call;
 		size = parse_entry(buf, call, &entry);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 4bec043c8690..a5e3d6acf1e1 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -159,7 +159,7 @@ static void *trigger_start(struct seq_file *m, loff_t *pos)
 
 	/* ->stop() is called even if ->start() fails */
 	mutex_lock(&event_mutex);
-	event_file = event_file_data(m->private);
+	event_file = event_file_file(m->private);
 	if (unlikely(!event_file))
 		return ERR_PTR(-ENODEV);
 
@@ -213,7 +213,7 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
 
 	mutex_lock(&event_mutex);
 
-	if (unlikely(!event_file_data(file))) {
+	if (unlikely(!event_file_file(file))) {
 		mutex_unlock(&event_mutex);
 		return -ENODEV;
 	}
@@ -293,7 +293,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
 	strim(buf);
 
 	mutex_lock(&event_mutex);
-	event_file = event_file_data(file);
+	event_file = event_file_file(file);
 	if (unlikely(!event_file)) {
 		mutex_unlock(&event_mutex);
 		kfree(buf);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index a4dcf0f24352..3a56e7c8aa4f 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -454,7 +454,7 @@ static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
 	struct tracing_map_elt *elt = NULL;
 	int idx;
 
-	idx = atomic_inc_return(&map->next_elt);
+	idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);
 	if (idx < map->max_elts) {
 		elt = *(TRACING_MAP_ELT(map->elts, idx));
 		if (map->ops && map->ops->elt_init)
@@ -699,7 +699,7 @@ void tracing_map_clear(struct tracing_map *map)
 {
 	unsigned int i;
 
-	atomic_set(&map->next_elt, -1);
+	atomic_set(&map->next_elt, 0);
 
 	atomic64_set(&map->hits, 0);
 	atomic64_set(&map->drops, 0);
@@ -783,7 +783,7 @@ struct tracing_map *tracing_map_create(unsigned int map_bits,
 	map->map_bits = map_bits;
 	map->max_elts = (1 << map_bits);
 
-	atomic_set(&map->next_elt, -1);
+	atomic_set(&map->next_elt, 0);
 
 	map->map_size = (1 << (map_bits + 1));
 	map->ops = ops;
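
For illustration, a minimal sketch of the calling convention documented in the kernel/trace/trace.h hunk above. It is not part of the patch: example_read() and its locals are hypothetical, while event_mutex, event_file_file(), event_file_data() and EVENT_FILE_FL_FREED are the interfaces shown in the diff (the real users are event_enable_read(), hist_show() and friends).

/*
 * Sketch only, assumes the kernel/trace build context of the patch.
 * example_read() is a hypothetical tracefs handler.
 */
static ssize_t example_read(struct file *filp, char __user *ubuf,
			    size_t cnt, loff_t *ppos)
{
	struct trace_event_file *file;
	ssize_t ret = 0;

	mutex_lock(&event_mutex);

	/* First lookup under event_mutex: NULL means the file was freed. */
	file = event_file_file(filp);
	if (!file) {
		mutex_unlock(&event_mutex);
		return -ENODEV;
	}

	/*
	 * While event_mutex stays held the file cannot be freed, so any
	 * repeated lookup may use event_file_data() directly.
	 */

	mutex_unlock(&event_mutex);
	return ret;
}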
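
The final tracing_map.c hunks switch next_elt from a -1-based atomic_inc_return() to a 0-based atomic_fetch_add_unless(), so the counter stops at max_elts instead of growing (and eventually wrapping) on every failed allocation. A standalone sketch of that bounded-index pattern, with hypothetical demo_* names:

#include <linux/atomic.h>

/* Hypothetical stand-in for the relevant tracing_map fields. */
struct demo_map {
	atomic_t	next_elt;	/* starts at 0, as after the patch */
	unsigned int	max_elts;
};

/* Returns a free slot index, or -1 once the map is full. */
static int demo_claim_slot(struct demo_map *map)
{
	/*
	 * atomic_fetch_add_unless() returns the old value and only
	 * increments while it is still below max_elts, so next_elt is
	 * clamped at max_elts rather than running past the table.
	 */
	int idx = atomic_fetch_add_unless(&map->next_elt, 1, map->max_elts);

	return idx < map->max_elts ? idx : -1;
}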