diff options
Diffstat (limited to 'kernel')
215 files changed, 12183 insertions, 6588 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bde4e9c..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,7 +1,6 @@ # # Generated files # -config_data.h -config_data.gz +kheaders.md5 timeconst.h hz.bc diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 84d882f3e299..bf770d7556f7 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER config RWSEM_SPIN_ON_OWNER def_bool y - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config LOCK_SPIN_ON_OWNER def_bool y @@ -242,9 +242,19 @@ config QUEUED_SPINLOCKS def_bool y if ARCH_USE_QUEUED_SPINLOCKS depends on SMP +config BPF_ARCH_SPINLOCK + bool + config ARCH_USE_QUEUED_RWLOCKS bool config QUEUED_RWLOCKS def_bool y if ARCH_USE_QUEUED_RWLOCKS depends on SMP + +config ARCH_HAS_MMIOWB + bool + +config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP diff --git a/kernel/Makefile b/kernel/Makefile index 6aa7543bcdb2..298437bb2c6a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. 
KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -70,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -116,17 +118,17 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n -$(obj)/configs.o: $(obj)/config_data.h +$(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) -filechk_ikconfiggz = \ - echo "static const char kernel_config_data[] __used = MAGIC_START"; \ - cat $< | scripts/bin2c; \ - echo "MAGIC_END;" +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz -targets += config_data.h -$(obj)/config_data.h: $(obj)/config_data.gz FORCE - $(call filechk,ikconfiggz) +quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz +cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE + $(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = mnt_want_write(internal); + err = __mnt_want_write(internal); if (err) { mntput(internal); kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); mntput(mnt); return 0; } diff --git a/kernel/async.c b/kernel/async.c 
index a893d6170944..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); @@ -149,7 +149,25 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) +/** + * async_schedule_node_domain - NUMA specific version of async_schedule_domain + * @func: function to execute asynchronously + * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to + * @domain: the domain + * + * Returns an async_cookie_t that may be used for checkpointing later. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally. + * + * Note: This function may be called from atomic or non-atomic contexts. + * + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. 
+ */ +async_cookie_t async_schedule_node_domain(async_func_t func, void *data, + int node, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -195,43 +213,30 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy current->flags |= PF_USED_ASYNC; /* schedule for execution */ - queue_work(system_unbound_wq, &entry->work); + queue_work_node(node, system_unbound_wq, &entry->work); return newcookie; } +EXPORT_SYMBOL_GPL(async_schedule_node_domain); /** - * async_schedule - schedule a function for asynchronous execution + * async_schedule_node - NUMA specific version of async_schedule * @func: function to execute asynchronously * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. - */ -async_cookie_t async_schedule(async_func_t func, void *data) -{ - return __async_schedule(func, data, &async_dfl_domain); -} -EXPORT_SYMBOL_GPL(async_schedule); - -/** - * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @func: function to execute asynchronously - * @data: data pointer to pass to the function - * @domain: the domain * - * Returns an async_cookie_t that may be used for checkpointing later. - * @domain may be used in the async_synchronize_*_domain() functions to - * wait within a certain synchronization domain rather than globally. A - * synchronization domain is specified via @domain. Note: This function - * may be called from atomic or non-atomic contexts. + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. 
*/ -async_cookie_t async_schedule_domain(async_func_t func, void *data, - struct async_domain *domain) +async_cookie_t async_schedule_node(async_func_t func, void *data, int node) { - return __async_schedule(func, data, domain); + return async_schedule_node_domain(func, data, node, &async_dfl_domain); } -EXPORT_SYMBOL_GPL(async_schedule_domain); +EXPORT_SYMBOL_GPL(async_schedule_node); /** * async_synchronize_full - synchronize all asynchronous function calls diff --git a/kernel/audit.c b/kernel/audit.c index 632d36059556..c89ea48c70a6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -396,10 +396,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old, struct audit_buffer *ab; int rc = 0; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%u old=%u ", function_name, new, old); + audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) @@ -1053,7 +1053,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) +static void audit_log_common_recv_msg(struct audit_context *context, + struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); @@ -1063,7 +1064,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return; } - *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); @@ -1071,6 +1072,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) audit_log_task_context(*ab); } +static inline void audit_log_user_recv_msg(struct audit_buffer **ab, + u16 msg_type) +{ + 
audit_log_common_recv_msg(NULL, ab, msg_type); +} + int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); @@ -1338,7 +1345,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) break; } - audit_log_common_recv_msg(&ab, msg_type); + audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, @@ -1361,8 +1368,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " op=%s audit_enabled=%d res=0", + msg_type == AUDIT_ADD_RULE ? + "add_rule" : "remove_rule", + audit_enabled); audit_log_end(ab); return -EPERM; } @@ -1373,7 +1384,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TRIM: audit_trim_trees(); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; @@ -1403,8 +1415,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... 
*/ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); @@ -1471,7 +1483,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, @@ -2054,153 +2067,6 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - if (cap_isclear(*cap)) { - audit_log_format(ab, " %s=0", prefix); - return; - } - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) - audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - audit_log_cap(ab, "cap_fp", &name->fcap.permitted); - audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); - audit_log_format(ab, " cap_fe=%d cap_fver=%x", - name->fcap.fE, name->fcap_ver); -} - -static inline int audit_copy_fcaps(struct audit_names *name, - const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = caps.permitted; - name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> - VFS_CAP_REVISION_SHIFT; - - return 0; -} - -/* Copy inode data into an audit_names. 
*/ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - -/** - * audit_log_name - produce AUDIT_PATH record from struct audit_names - * @context: audit_context for the task - * @n: audit_names structure with reportable details - * @path: optional path to report instead of audit_names->name - * @record_num: record number to report when handling a list of names - * @call_panic: optional pointer to int that will be updated if secid fails - */ -void audit_log_name(struct audit_context *context, struct audit_names *n, - const struct path *path, int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; - - audit_log_format(ab, "item=%d", record_num); - - if (path) - audit_log_d_path(ab, " name=", path); - else if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != AUDIT_INO_UNSET) - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - from_kuid(&init_user_ns, n->uid), - from_kgid(&init_user_ns, n->gid), - MAJOR(n->rdev), - MINOR(n->rdev)); - if (n->osid != 0) { - char *ctx = NULL; 
- u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - /* log the audit_names record type */ - switch(n->type) { - case AUDIT_TYPE_NORMAL: - audit_log_format(ab, " nametype=NORMAL"); - break; - case AUDIT_TYPE_PARENT: - audit_log_format(ab, " nametype=PARENT"); - break; - case AUDIT_TYPE_CHILD_DELETE: - audit_log_format(ab, " nametype=DELETE"); - break; - case AUDIT_TYPE_CHILD_CREATE: - audit_log_format(ab, " nametype=CREATE"); - break; - default: - audit_log_format(ab, " nametype=UNKNOWN"); - break; - } - - audit_log_fcaps(ab, n); - audit_log_end(ab); -} - int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; @@ -2322,6 +2188,91 @@ void audit_log_link_denied(const char *operation) audit_log_end(ab); } +/* global counter which is incremented every time something logs in */ +static atomic_t session_id = ATOMIC_INIT(0); + +static int audit_set_loginuid_perm(kuid_t loginuid) +{ + /* if we are unset, we don't need privs */ + if (!audit_loginuid_set(current)) + return 0; + /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ + if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) + return -EPERM; + /* it is set, you need permission */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + /* reject if this is not an unset and we don't allow that */ + if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) + && uid_valid(loginuid)) + return -EPERM; + return 0; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, + unsigned int oldsessionid, + unsigned int sessionid, int rc) +{ + struct audit_buffer *ab; + uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; + + if (!audit_enabled) + return; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + + uid = 
from_kuid(&init_user_ns, task_uid(current)); + oldloginuid = from_kuid(&init_user_ns, koldloginuid); + loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(); + + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); + audit_log_end(ab); +} + +/** + * audit_set_loginuid - set current task's loginuid + * @loginuid: loginuid value + * + * Returns 0. + * + * Called (set) from fs/proc/base.c::proc_loginuid_write(). + */ +int audit_set_loginuid(kuid_t loginuid) +{ + unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; + kuid_t oldloginuid; + int rc; + + oldloginuid = audit_get_loginuid(current); + oldsessionid = audit_get_sessionid(current); + + rc = audit_set_loginuid_perm(loginuid); + if (rc) + goto out; + + /* are we setting or clearing? */ + if (uid_valid(loginuid)) { + sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == AUDIT_SID_UNSET)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } + + current->sessionid = sessionid; + current->loginuid = loginuid; +out: + audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); + return rc; +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit.h b/kernel/audit.h index 91421679a168..958d5b8fc1b3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -69,6 +69,7 @@ struct audit_cap_data { kernel_cap_t effective; /* effective set of process */ }; kernel_cap_t ambient; + kuid_t rootid; }; /* When fs/namei.c:getname() is called, we store the pointer in name and bump @@ -212,15 +213,6 @@ extern bool audit_ever_enabled; extern void audit_log_session_info(struct audit_buffer *ab); -extern void audit_copy_inode(struct audit_names *name, - const struct dentry *dentry, - struct 
inode *inode); -extern void audit_log_cap(struct audit_buffer *ab, char *prefix, - kernel_cap_t *cap); -extern void audit_log_name(struct audit_context *context, - struct audit_names *n, const struct path *path, - int record_num, int *call_panic); - extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 @@ -267,25 +259,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab, extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); -/* audit watch functions */ +/* audit watch/mark/tree functions */ #ifdef CONFIG_AUDITSYSCALL +extern unsigned int audit_serial(void); +extern int auditsc_get_stamp(struct audit_context *ctx, + struct timespec64 *t, unsigned int *serial); + extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); -extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op); extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, + dev_t dev); -extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, + char *pathname, int len); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); -extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, + 
unsigned long ino, dev_t dev); extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); -extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); +extern int audit_exe_compare(struct task_struct *tsk, + struct audit_fsnotify_mark *mark); + +extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); +extern void audit_put_chunk(struct audit_chunk *chunk); +extern bool audit_tree_match(struct audit_chunk *chunk, + struct audit_tree *tree); +extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); +extern int audit_add_tree_rule(struct audit_krule *rule); +extern int audit_remove_tree_rule(struct audit_krule *rule); +extern void audit_trim_trees(void); +extern int audit_tag_tree(char *old, char *new); +extern const char *audit_tree_path(struct audit_tree *tree); +extern void audit_put_tree(struct audit_tree *tree); +extern void audit_kill_trees(struct audit_context *context); -#else +extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx); +extern struct list_head *audit_killed_trees(void); +#else /* CONFIG_AUDITSYSCALL */ +#define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} #define audit_get_watch(w) {} #define audit_to_watch(k, p, l, o) (-EINVAL) @@ -301,21 +320,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) #define audit_dupe_exe(n, o) (-EINVAL) -#endif /* CONFIG_AUDITSYSCALL */ -#ifdef CONFIG_AUDITSYSCALL -extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); -extern void audit_put_chunk(struct audit_chunk *chunk); -extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); -extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); -extern int audit_add_tree_rule(struct audit_krule *rule); 
-extern int audit_remove_tree_rule(struct audit_krule *rule); -extern void audit_trim_trees(void); -extern int audit_tag_tree(char *old, char *new); -extern const char *audit_tree_path(struct audit_tree *tree); -extern void audit_put_tree(struct audit_tree *tree); -extern void audit_kill_trees(struct list_head *list); -#else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL #define audit_make_tree(rule, str, op) -EINVAL @@ -323,8 +328,11 @@ extern void audit_kill_trees(struct list_head *list); #define audit_put_tree(tree) (void)0 #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* never called */ -#define audit_kill_trees(list) BUG() -#endif +#define audit_kill_trees(context) BUG() + +#define audit_signal_info(s, t) AUDIT_DISABLED +#define audit_filter_inodes(t, c) AUDIT_DISABLED +#endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); @@ -334,14 +342,5 @@ extern u32 audit_sig_sid; extern int audit_filter(int msgtype, unsigned int listtype); -#ifdef CONFIG_AUDITSYSCALL -extern int audit_signal_info(int sig, struct task_struct *t); -extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); -extern struct list_head *audit_killed_trees(void); -#else -#define audit_signal_info(s,t) AUDIT_DISABLED -#define audit_filter_inodes(t,c) AUDIT_DISABLED -#endif - extern void audit_ctl_lock(void); extern void audit_ctl_unlock(void); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index cf4512a33675..37ae95cfb7f4 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -127,7 +127,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_session_info(ab); diff --git a/kernel/audit_tree.c 
b/kernel/audit_tree.c index d4af4d97f847..abfb112f26aa 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static void audit_tree_log_remove_rule(struct audit_krule *rule) +static void audit_tree_log_remove_rule(struct audit_context *context, + struct audit_krule *rule) { struct audit_buffer *ab; if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_format(ab, "op=remove_rule dir="); @@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) audit_log_end(ab); } -static void kill_rules(struct audit_tree *tree) +static void kill_rules(struct audit_context *context, struct audit_tree *tree) { struct audit_krule *rule, *next; struct audit_entry *entry; @@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - audit_tree_log_remove_rule(rule); + audit_tree_log_remove_rule(context, rule); if (entry->rule.exe) audit_remove_mark(entry->rule.exe); rule->tree = NULL; @@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree) tree->goner = 1; spin_unlock(&hash_lock); mutex_lock(&audit_filter_mutex); - kill_rules(tree); + kill_rules(audit_context(), tree); list_del_init(&tree->list); mutex_unlock(&audit_filter_mutex); prune_one(tree); @@ -973,8 +974,10 @@ static void audit_schedule_prune(void) * ... and that one is done if evict_chunk() decides to delay until the end * of syscall. Runs synchronously. 
*/ -void audit_kill_trees(struct list_head *list) +void audit_kill_trees(struct audit_context *context) { + struct list_head *list = &context->killed_trees; + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); @@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list) struct audit_tree *victim; victim = list_entry(list->next, struct audit_tree, list); - kill_rules(victim); + kill_rules(context, victim); list_del_init(&victim->list); mutex_unlock(&audit_filter_mutex); @@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk) list_del_init(&owner->same_root); spin_unlock(&hash_lock); if (!postponed) { - kill_rules(owner); + kill_rules(audit_context(), owner); list_move(&owner->list, &prune_list); need_prune = 1; } else { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 20ef9ba134b0..e8d1adeb2223 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -242,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index bf309f2592c4..63f8b3f26fab 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -670,7 +670,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = AUDIT_UID_UNSET; break; } - /* fallthrough if set */ + /* fall through - if set */ default: data->values[i] = f->val; } @@ -1091,7 +1091,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); @@ -1355,7 +1355,7 @@ int audit_filter(int msgtype, unsigned int listtype) if 
(f->lsm_rule) { security_task_getsecid(current, &sid); result = security_audit_rule_match(sid, - f->type, f->op, f->lsm_rule, NULL); + f->type, f->op, f->lsm_rule); } break; case AUDIT_EXE: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6593a5207fb0..d1eab1d4a930 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -631,9 +631,8 @@ static int audit_filter_rules(struct task_struct *tsk, need_sid = 0; } result = security_audit_rule_match(sid, f->type, - f->op, - f->lsm_rule, - ctx); + f->op, + f->lsm_rule); } break; case AUDIT_OBJ_USER: @@ -647,13 +646,17 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find files that match */ if (name) { result = security_audit_rule_match( - name->osid, f->type, f->op, - f->lsm_rule, ctx); + name->osid, + f->type, + f->op, + f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (security_audit_rule_match(n->osid, f->type, - f->op, f->lsm_rule, - ctx)) { + if (security_audit_rule_match( + n->osid, + f->type, + f->op, + f->lsm_rule)) { ++result; break; } @@ -664,7 +667,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; if (security_audit_rule_match(ctx->ipc.osid, f->type, f->op, - f->lsm_rule, ctx)) + f->lsm_rule)) ++result; } break; @@ -1136,6 +1139,32 @@ out: kfree(buf_head); } +void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + if (cap_isclear(*cap)) { + audit_log_format(ab, " %s=0", prefix); + return; + } + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) + audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + if (name->fcap_ver == -1) { + audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? 
cap_fi=?"); + return; + } + audit_log_cap(ab, "cap_fp", &name->fcap.permitted); + audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); + audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d", + name->fcap.fE, name->fcap_ver, + from_kuid(&init_user_ns, name->fcap.rootid)); +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1258,6 +1287,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len) return len; } +/* + * audit_log_name - produce AUDIT_PATH record from struct audit_names + * @context: audit_context for the task + * @n: audit_names structure with reportable details + * @path: optional path to report instead of audit_names->name + * @record_num: record number to report when handling a list of names + * @call_panic: optional pointer to int that will be updated if secid fails + */ +static void audit_log_name(struct audit_context *context, struct audit_names *n, + const struct path *path, int record_num, int *call_panic) +{ + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; + + audit_log_format(ab, "item=%d", record_num); + + if (path) + audit_log_d_path(ab, " name=", path); + else if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd + */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != AUDIT_INO_UNSET) + audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + from_kuid(&init_user_ns, 
n->uid), + from_kgid(&init_user_ns, n->gid), + MAJOR(n->rdev), + MINOR(n->rdev)); + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + if (call_panic) + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + /* log the audit_names record type */ + switch (n->type) { + case AUDIT_TYPE_NORMAL: + audit_log_format(ab, " nametype=NORMAL"); + break; + case AUDIT_TYPE_PARENT: + audit_log_format(ab, " nametype=PARENT"); + break; + case AUDIT_TYPE_CHILD_DELETE: + audit_log_format(ab, " nametype=DELETE"); + break; + case AUDIT_TYPE_CHILD_CREATE: + audit_log_format(ab, " nametype=CREATE"); + break; + default: + audit_log_format(ab, " nametype=UNKNOWN"); + break; + } + + audit_log_fcaps(ab, n); + audit_log_end(ab); +} + static void audit_log_proctitle(void) { int res; @@ -1358,6 +1478,9 @@ static void audit_log_exit(void) audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); audit_log_cap(ab, "pe", &axs->new_pcap.effective); audit_log_cap(ab, "pa", &axs->new_pcap.ambient); + audit_log_format(ab, " frootid=%d", + from_kuid(&init_user_ns, + axs->fcap.rootid)); break; } } @@ -1444,6 +1567,9 @@ void __audit_free(struct task_struct *tsk) if (!context) return; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(context); + /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a * random task_struct that doesn't doesn't have any meaningful data we @@ -1460,9 +1586,6 @@ void __audit_free(struct task_struct *tsk) audit_log_exit(); } - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - audit_set_context(tsk, NULL); audit_free_context(context); } @@ -1537,6 +1660,9 @@ void __audit_syscall_exit(int success, long return_code) if (!context) return; + if (!list_empty(&context->killed_trees)) + 
audit_kill_trees(context); + if (!context->dummy && context->in_syscall) { if (success) context->return_valid = AUDITSC_SUCCESS; @@ -1571,9 +1697,6 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); @@ -1750,6 +1873,47 @@ void __audit_getname(struct filename *name) get_fs_pwd(current->fs, &context->pwd); } +static inline int audit_copy_fcaps(struct audit_names *name, + const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap.rootid = caps.rootid; + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> + VFS_CAP_REVISION_SHIFT; + + return 0; +} + +/* Copy inode data into an audit_names. 
*/ +void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, + struct inode *inode, unsigned int flags) +{ + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + security_inode_getsecid(inode, &name->osid); + if (flags & AUDIT_INODE_NOEVAL) { + name->fcap_ver = -1; + return; + } + audit_copy_fcaps(name, dentry); +} + /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1763,10 +1927,31 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; + struct audit_entry *e; + struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; + int i; if (!context->in_syscall) return; + rcu_read_lock(); + if (!list_empty(list)) { + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; + } + } + } + } + rcu_read_unlock(); + if (!name) goto out_alloc; @@ -1832,7 +2017,7 @@ out: n->type = AUDIT_TYPE_NORMAL; } handle_path(dentry); - audit_copy_inode(n, dentry, inode); + audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL); } void __audit_file(const struct file *file) @@ -1875,14 +2060,12 @@ void __audit_inode_child(struct inode *parent, for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - if (f->type == AUDIT_FSTYPE) { - if (audit_comparator(parent->i_sb->s_magic, - f->op, f->val)) { - if (e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } - } + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + 
rcu_read_unlock(); + return; } } } @@ -1933,7 +2116,7 @@ void __audit_inode_child(struct inode *parent, n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; - audit_copy_inode(n, NULL, parent); + audit_copy_inode(n, NULL, parent, 0); } if (!found_child) { @@ -1952,7 +2135,7 @@ void __audit_inode_child(struct inode *parent, } if (inode) - audit_copy_inode(found_child, dentry, inode); + audit_copy_inode(found_child, dentry, inode, 0); else found_child->ino = AUDIT_INO_UNSET; } @@ -1983,90 +2166,6 @@ int auditsc_get_stamp(struct audit_context *ctx, return 1; } -/* global counter which is incremented every time something logs in */ -static atomic_t session_id = ATOMIC_INIT(0); - -static int audit_set_loginuid_perm(kuid_t loginuid) -{ - /* if we are unset, we don't need privs */ - if (!audit_loginuid_set(current)) - return 0; - /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ - if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) - return -EPERM; - /* it is set, you need permission */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - /* reject if this is not an unset and we don't allow that */ - if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) - return -EPERM; - return 0; -} - -static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, - unsigned int oldsessionid, unsigned int sessionid, - int rc) -{ - struct audit_buffer *ab; - uid_t uid, oldloginuid, loginuid; - struct tty_struct *tty; - - if (!audit_enabled) - return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; - - uid = from_kuid(&init_user_ns, task_uid(current)); - oldloginuid = from_kuid(&init_user_ns, koldloginuid); - loginuid = from_kuid(&init_user_ns, kloginuid), - tty = audit_get_tty(); - - audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); - audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", - oldloginuid, 
loginuid, tty ? tty_name(tty) : "(none)", - oldsessionid, sessionid, !rc); - audit_put_tty(tty); - audit_log_end(ab); -} - -/** - * audit_set_loginuid - set current task's audit_context loginuid - * @loginuid: loginuid value - * - * Returns 0. - * - * Called (set) from fs/proc/base.c::proc_loginuid_write(). - */ -int audit_set_loginuid(kuid_t loginuid) -{ - unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; - kuid_t oldloginuid; - int rc; - - oldloginuid = audit_get_loginuid(current); - oldsessionid = audit_get_sessionid(current); - - rc = audit_set_loginuid_perm(loginuid); - if (rc) - goto out; - - /* are we setting or clearing? */ - if (uid_valid(loginuid)) { - sessionid = (unsigned int)atomic_inc_return(&session_id); - if (unlikely(sessionid == AUDIT_SID_UNSET)) - sessionid = (unsigned int)atomic_inc_return(&session_id); - } - - current->sessionid = sessionid; - current->loginuid = loginuid; -out: - audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); - return rc; -} - /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag @@ -2355,6 +2454,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap.rootid = vcaps.rootid; ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = old->cap_permitted; diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 1323360d90e3..a563c8fdad0d 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -48,19 +48,14 @@ static void backtrace_test_irq(void) #ifdef CONFIG_STACKTRACE static void backtrace_test_saved(void) { - struct stack_trace trace; unsigned long entries[8]; + unsigned int nr_entries; pr_info("Testing a saved backtrace.\n"); pr_info("The following trace is a kernel self test and not a bug!\n"); - trace.nr_entries = 0; - trace.max_entries = 
ARRAY_SIZE(entries); - trace.entries = entries; - trace.skip = 0; - - save_stack_trace(&trace); - print_stack_trace(&trace, 0); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + stack_trace_print(entries, nr_entries, 0); } #else static void backtrace_test_saved(void) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 25632a75d630..c72e0d8e1e65 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; + char *val; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -262,17 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) + if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + if (unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) + return -EINVAL; + + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); - else - memcpy(array->value + - array->elem_size * (index & array->index_mask), - value, map->value_size); + } else { + val = array->value + + array->elem_size * (index & array->index_mask); + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, val, value, false); + else + copy_map_value(map, val, value); + } return 0; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c57bd10340ed..bd3921b1514b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -157,7 +157,7 @@ * */ -#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define 
BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) @@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + static bool btf_type_is_array(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; @@ -525,7 +530,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) /* * Regular int is not a bit field and it must be either - * u8/u16/u32/u64. + * u8/u16/u32/u64 or __int128. */ static bool btf_type_int_is_regular(const struct btf_type *t) { @@ -538,7 +543,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t) if (BITS_PER_BYTE_MASKED(nr_bits) || BTF_INT_OFFSET(int_data) || (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && + nr_bytes != (2 * sizeof(u64)))) { return false; } @@ -1063,9 +1069,9 @@ static int btf_int_check_member(struct btf_verifier_env *env, nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { + if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); + "nr_copy_bits exceeds 128"); return -EINVAL; } @@ -1119,9 +1125,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env, bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { + if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); + "nr_copy_bits exceeds 128"); return -EINVAL; } @@ -1168,9 +1174,9 @@ static s32 btf_int_check_meta(struct 
btf_verifier_env *env, nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); - if (nr_bits > BITS_PER_U64) { + if (nr_bits > BITS_PER_U128) { btf_verifier_log_type(env, t, "nr_bits exceeds %zu", - BITS_PER_U64); + BITS_PER_U128); return -EINVAL; } @@ -1211,31 +1217,93 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } +static void btf_int128_print(struct seq_file *m, void *data) +{ + /* data points to a __int128 number. + * Suppose + * int128_num = *(__int128 *)data; + * The below formulas shows what upper_num and lower_num represents: + * upper_num = int128_num >> 64; + * lower_num = int128_num & 0xffffffffFFFFFFFFULL; + */ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = *(u64 *)data; + lower_num = *(u64 *)(data + 8); +#else + upper_num = *(u64 *)(data + 8); + lower_num = *(u64 *)data; +#endif + if (upper_num == 0) + seq_printf(m, "0x%llx", lower_num); + else + seq_printf(m, "0x%llx%016llx", upper_num, lower_num); +} + +static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, + u16 right_shift_bits) +{ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = print_num[0]; + lower_num = print_num[1]; +#else + upper_num = print_num[1]; + lower_num = print_num[0]; +#endif + + /* shake out un-needed bits by shift/or operations */ + if (left_shift_bits >= 64) { + upper_num = lower_num << (left_shift_bits - 64); + lower_num = 0; + } else { + upper_num = (upper_num << left_shift_bits) | + (lower_num >> (64 - left_shift_bits)); + lower_num = lower_num << left_shift_bits; + } + + if (right_shift_bits >= 64) { + lower_num = upper_num >> (right_shift_bits - 64); + upper_num = 0; + } else { + lower_num = (lower_num >> right_shift_bits) | + (upper_num << (64 - right_shift_bits)); + upper_num = upper_num >> right_shift_bits; + } + +#ifdef __BIG_ENDIAN_BITFIELD + print_num[0] = upper_num; + print_num[1] = lower_num; +#else + print_num[0] = lower_num; + 
print_num[1] = upper_num; +#endif +} + static void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_bits, struct seq_file *m) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; u8 nr_copy_bits; - u64 print_num; + u64 print_num[2] = {}; nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num = 0; - memcpy(&print_num, data, nr_copy_bytes); + memcpy(print_num, data, nr_copy_bytes); #ifdef __BIG_ENDIAN_BITFIELD left_shift_bits = bits_offset; #else - left_shift_bits = BITS_PER_U64 - nr_copy_bits; + left_shift_bits = BITS_PER_U128 - nr_copy_bits; #endif - right_shift_bits = BITS_PER_U64 - nr_bits; + right_shift_bits = BITS_PER_U128 - nr_bits; - print_num <<= left_shift_bits; - print_num >>= right_shift_bits; - - seq_printf(m, "0x%llx", print_num); + btf_int128_shift(print_num, left_shift_bits, right_shift_bits); + btf_int128_print(m, print_num); } @@ -1250,7 +1318,7 @@ static void btf_int_bits_seq_show(const struct btf *btf, /* * bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. + * BTF_INT_OFFSET() cannot exceed 128 bits. */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -1274,6 +1342,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, } switch (nr_bits) { + case 128: + btf_int128_print(m, data); + break; case 64: if (sign) seq_printf(m, "%lld", *(s64 *)data); @@ -1980,6 +2051,43 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +/* find 'struct bpf_spin_lock' in map value. 
+ * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ + const struct btf_member *member; + u32 i, off = -ENOENT; + + if (!__btf_type_is_struct(t)) + return -EINVAL; + + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + if (!__btf_type_is_struct(member_type)) + continue; + if (member_type->size != sizeof(struct bpf_spin_lock)) + continue; + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), + "bpf_spin_lock")) + continue; + if (off != -ENOENT) + /* only one 'struct bpf_spin_lock' is allowed */ + return -E2BIG; + off = btf_member_bit_offset(t, member); + if (off % 8) + /* valid C code cannot generate such BTF */ + return -EINVAL; + off /= 8; + if (off % __alignof__(struct bpf_spin_lock)) + /* valid struct bpf_spin_lock will be 4 byte aligned */ + return -EINVAL; + } + return off; +} + static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index d17d05570a3f..4e807973aa80 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -230,6 +230,7 @@ cleanup: * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach * @type: Type of attach operation + * @flags: Option flags * * Must be called with cgroup_mutex held. */ @@ -363,7 +364,7 @@ cleanup: * Must be called with cgroup_mutex held. 
*/ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 unused_flags) + enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; enum bpf_cgroup_storage_type stype; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f908b9356025..c605397c79f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -78,7 +78,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; @@ -104,6 +104,32 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return fp; } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + struct bpf_prog *prog; + int cpu; + + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); + if (!prog) + return NULL; + + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->aux->stats) { + kfree(prog->aux); + vfree(prog); + return NULL; + } + + for_each_possible_cpu(cpu) { + struct bpf_prog_stats *pstats; + + pstats = per_cpu_ptr(prog->aux->stats, cpu); + u64_stats_init(&pstats->syncp); + } + return prog; +} EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) @@ -231,7 +257,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->aux); + if (fp->aux) { + free_percpu(fp->aux->stats); + kfree(fp->aux); + } vfree(fp); } @@ -307,15 +336,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_imm(struct 
bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, u32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s32 delta = end_new - end_old; s64 imm = insn->imm; - if (curr < pos && curr + imm + 1 > pos) + if (curr < pos && curr + imm + 1 >= end_old) imm += delta; - else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; @@ -324,15 +354,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, u32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 delta = end_new - end_old; s32 off = insn->off; - if (curr < pos && curr + off + 1 > pos) + if (curr < pos && curr + off + 1 >= end_old) off += delta; - else if (curr > pos + delta && curr + off + 1 <= pos + delta) + else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; @@ -341,10 +372,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, - const bool probe_pass) +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, + s32 end_new, const bool probe_pass) { - u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); + u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; @@ -356,22 +387,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, * do any other adjustments. Therefore skip the patchlet. 
*/ if (probe_pass && i == pos) { - i += delta + 1; - insn++; + i = end_new; + insn = prog->insnsi + end_old; } code = insn->code; - if (BPF_CLASS(code) != BPF_JMP || + if ((BPF_CLASS(code) != BPF_JMP && + BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; - ret = bpf_adj_delta_to_imm(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_imm(insn, pos, end_old, + end_new, i, probe_pass); } else { - ret = bpf_adj_delta_to_off(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_off(insn, pos, end_old, + end_new, i, probe_pass); } if (ret) break; @@ -421,7 +453,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, insn_delta, true)) + bpf_adj_branches(prog, off, off + 1, off + len, true)) return NULL; /* Several new instructions need to be inserted. Make room @@ -453,13 +485,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. 
*/ - BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); + BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } +int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) +{ + /* Branch offsets can't overflow when program is shrinking, no need + * to call bpf_adj_branches(..., true) here + */ + memmove(prog->insnsi + off, prog->insnsi + off + cnt, + sizeof(struct bpf_insn) * (prog->len - off - cnt)); + prog->len -= cnt; + + return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); +} + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; @@ -495,7 +539,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; @@ -804,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_unlock_ro(hdr); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); @@ -934,6 +977,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. 
*/ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, + off); + break; + case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -1130,6 +1194,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ + /* 32-bit Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP32, JEQ, X), \ + INSN_3(JMP32, JNE, X), \ + INSN_3(JMP32, JGT, X), \ + INSN_3(JMP32, JLT, X), \ + INSN_3(JMP32, JGE, X), \ + INSN_3(JMP32, JLE, X), \ + INSN_3(JMP32, JSGT, X), \ + INSN_3(JMP32, JSLT, X), \ + INSN_3(JMP32, JSGE, X), \ + INSN_3(JMP32, JSLE, X), \ + INSN_3(JMP32, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP32, JEQ, K), \ + INSN_3(JMP32, JNE, K), \ + INSN_3(JMP32, JGT, K), \ + INSN_3(JMP32, JLT, K), \ + INSN_3(JMP32, JGE, K), \ + INSN_3(JMP32, JLE, K), \ + INSN_3(JMP32, JSGT, K), \ + INSN_3(JMP32, JSLT, K), \ + INSN_3(JMP32, JSGE, K), \ + INSN_3(JMP32, JSLE, K), \ + INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ @@ -1202,8 +1291,9 @@ bool bpf_opcode_in_insntable(u8 code) #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context - * @ctx: is the data we are operating on + * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions + * @stack: is the eBPF storage stack * * Decode and execute eBPF instructions. 
*/ @@ -1390,145 +1480,49 @@ select_insn: out: CONT; } - /* JMP */ JMP_JA: insn += insn->off; CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_X: - if (DST < SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_K: - if (DST < IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_X: - if (DST <= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_K: - if (DST <= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_X: - if (((s64) DST) < ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_K: - if (((s64) DST) < ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_X: - if (((s64) DST) <= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_K: - if (((s64) DST) <= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; JMP_EXIT: return BPF_R0; - + /* JMP */ +#define 
COND_JMP(SIGN, OPCODE, CMP_OP) \ + JMP_##OPCODE##_X: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_X: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP_##OPCODE##_K: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_K: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; + COND_JMP(u, JEQ, ==) + COND_JMP(u, JNE, !=) + COND_JMP(u, JGT, >) + COND_JMP(u, JLT, <) + COND_JMP(u, JGE, >=) + COND_JMP(u, JLE, <=) + COND_JMP(u, JSET, &) + COND_JMP(s, JSGT, >) + COND_JMP(s, JSLT, <) + COND_JMP(s, JSGE, >=) + COND_JMP(s, JSLE, <=) +#undef COND_JMP /* STX and ST and LDX*/ #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ @@ -2036,6 +2030,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_spin_lock_proto __weak; +const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; @@ -2101,6 +2097,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); +int sysctl_bpf_stats_enabled __read_mostly; + /* All definitions of tracepoints related to BPF. 
*/ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8974b3755670..3c18260403dd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -162,10 +162,14 @@ static void cpu_map_kthread_stop(struct work_struct *work) static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; struct sk_buff *skb; + /* Part of headroom was reserved to xdpf */ + hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; + /* build_skb need to place skb_shared_info after SKB end, and * also want to know the memory "truesize". Thus, need to * know the memory frame size backing xdp_buff. @@ -183,15 +187,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + + frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdpf->data - xdpf->headroom; + pkt_data_start = xdpf->data - hard_start_headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdpf->headroom); + skb_reserve(skb, hard_start_headroom); __skb_put(skb, xdpf->len); if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); @@ -205,6 +209,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + return skb; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d6b76377cb6e..de73f55e42fd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -67,7 +67,7 @@ const char *const bpf_class_string[8] = { [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", 
+ [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; @@ -136,23 +136,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", + class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { - verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", insn->imm); } } else if (class == BPF_STX) { @@ -220,7 +219,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -244,13 +243,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s %c%d goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 
'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], + class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { - verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s 0x%x goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f9274114c88d..fed15cf94dca 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) BITS_PER_LONG == 64; } -static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) -{ - u32 size = htab->map.value_size; - - if (percpu || fd_htab_map_needs_adjust(htab)) - size = round_up(size, 8); - return size; -} - static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab_size_value(htab, percpu); + u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } + check_and_init_map_lock(&htab->map, + l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); if (percpu) { + size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); - } else { + } else if (fd_htab_map_needs_adjust(htab)) { + size = round_up(size, 8); memcpy(l_new->key + round_up(key_size, 8), value, size); + } else { + copy_map_value(&htab->map, + l_new->key + round_up(key_size, 8), + value); } 
l_new->hash = hash; @@ -805,11 +804,11 @@ dec_count: static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (l_old && map_flags == BPF_NOEXIST) + if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; - if (!l_old && map_flags == BPF_EXIST) + if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; @@ -828,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -841,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, b = __select_bucket(htab, hash); head = &b->head; + if (unlikely(map_flags & BPF_F_LOCK)) { + if (unlikely(!map_value_has_spin_lock(map))) + return -EINVAL; + /* find an element without taking the bucket lock */ + l_old = lookup_nulls_elem_raw(head, hash, key, key_size, + htab->n_buckets); + ret = check_flags(htab, l_old, map_flags); + if (ret) + return ret; + if (l_old) { + /* grab the element lock and update value in place */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + return 0; + } + /* fall through, grab the bucket lock and lookup again. + * 99.9% chance that the element won't be found, + * but second lookup under lock has to be done. + */ + } + /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); @@ -850,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { + /* first lookup without the bucket lock didn't find the element, + * but second lookup with the bucket lock found it. 
+ * This case is highly unlikely, but has to be dealt with: + * grab the element lock in addition to the bucket lock + * and update element in place + */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + ret = 0; + goto err; + } + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a74972b07e74..a411fc17d265 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -221,6 +221,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg2_type = ARG_CONST_SIZE, }; +#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + union { + __u32 val; + arch_spinlock_t lock; + } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; + + compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); + BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); + BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); + arch_spin_lock(l); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + + arch_spin_unlock(l); +} + +#else + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); + do { + atomic_cond_read_relaxed(l, !VAL); + } while (atomic_xchg(l, 1)); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + atomic_set_release(l, 0); +} + +#endif + +static DEFINE_PER_CPU(unsigned long, irqsave_flags); + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + local_irq_save(flags); + __bpf_spin_lock(lock); + __this_cpu_write(irqsave_flags, flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_lock_proto = { + .func = bpf_spin_lock, + .gpl_only = false, + .ret_type = RET_VOID, + 
.arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + flags = __this_cpu_read(irqsave_flags); + __bpf_spin_unlock(lock); + local_irq_restore(flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_unlock_proto = { + .func = bpf_spin_unlock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, + bool lock_src) +{ + struct bpf_spin_lock *lock; + + if (lock_src) + lock = src + map->spin_lock_off; + else + lock = dst + map->spin_lock_off; + preempt_disable(); + ____bpf_spin_lock(lock); + copy_map_value(map, dst, src); + ____bpf_spin_unlock(lock); + preempt_enable(); +} + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2ada5e21dfa6..bc53e5b20ddc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -554,19 +554,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ } EXPORT_SYMBOL(bpf_prog_get_type_path); -static void bpf_evict_inode(struct inode *inode) -{ - enum bpf_type type; - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); - if (!bpf_inode_type(inode, &type)) - bpf_any_put(inode->i_private, type); -} - /* * Display the mount options in /proc/mounts. 
*/ @@ -579,11 +566,22 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void bpf_free_inode(struct inode *inode) +{ + enum bpf_type type; + + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); + free_inode_nonrcu(inode); +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .evict_inode = bpf_evict_inode, + .free_inode = bpf_free_inode, }; enum { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 07a34ef562a0..6b572e2de7fb 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags != BPF_ANY && flags != BPF_EXIST) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) + return -EINVAL; + + if (unlikely(flags & BPF_NOEXIST)) + return -EINVAL; + + if (unlikely((flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, if (!storage) return -ENOENT; + if (flags & BPF_F_LOCK) { + copy_map_value_locked(map, storage->buf->data, value, false); + return 0; + } + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + map->value_size, __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, @@ -147,6 +159,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return -ENOMEM; memcpy(&new->data[0], value, map->value_size); + check_and_init_map_lock(map, new->data); new = xchg(&storage->buf, new); kfree_rcu(new, rcu); @@ -483,6 +496,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, storage->buf = kmalloc_node(size, flags, map->numa_node); 
if (!storage->buf) goto enomem; + check_and_init_map_lock(map, storage->buf->data); } else { storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); if (!storage->percpu_buf) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index abf1002080df..93a5cbbde421 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) } if (!node || node->prefixlen != key->prefixlen || + node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) { ret = -ENOENT; goto out; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 52378d3e34b3..3dff41403583 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } + if (map_value_has_spin_lock(inner_map)) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops) @@ -53,6 +58,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; + inner_map_meta->spin_lock_off = inner_map->spin_lock_off; /* Misc members not needed in bpf_map_meta_equal() check. 
*/ inner_map_meta->ops = inner_map->ops; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 54cf2b9c44a4..ba635209ae9a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { const struct bpf_prog_offload_ops *ops; struct list_head netdevs; + void *priv; }; struct bpf_offload_netdev { @@ -173,6 +174,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) return ret; } +void +bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, + struct bpf_insn *insn) +{ + const struct bpf_prog_offload_ops *ops; + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + ops = offload->offdev->ops; + if (!offload->opt_failed && ops->replace_insn) + ret = ops->replace_insn(env, off, insn); + offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + +void +bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + if (!offload->opt_failed && offload->offdev->ops->remove_insns) + ret = offload->offdev->ops->remove_insns(env, off, cnt); + offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; @@ -634,7 +670,7 @@ unlock: EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); struct bpf_offload_dev * -bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; int err; @@ -653,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) return ERR_PTR(-ENOMEM); offdev->ops = ops; + offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; @@ 
-665,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); + +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) +{ + return offdev->priv; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d43b14535827..950ab2f28922 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry) struct stack_map_irq_work *work; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read(work->sem); + up_read_non_owner(work->sem); work->sem = NULL; } @@ -338,6 +338,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } else { work->sem = &current->mm->mmap_sem; irq_work_queue(&work->irq_work); + /* + * The irq_work will release the mmap_sem with + * up_read_non_owner(). The rwsem_release() is called + * here to release the lock from lockdep's perspective. + */ + rwsem_release(&current->mm->mmap_sem.dep_map, 1, _RET_IP_); } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8577bb7f8be6..afca36f53c49 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -136,21 +136,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) void *bpf_map_area_alloc(size_t size, int numa_node) { - /* We definitely need __GFP_NORETRY, so OOM killer doesn't - * trigger under memory pressure as we really just want to - * fail instead. + /* We really just want to fail instead of triggering OOM killer + * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, + * which is used for lower order allocation requests. + * + * It has been observed that higher order allocation requests done by + * vmalloc with __GFP_NORETRY being set might fail due to not trying + * to reclaim memory from the page cache, thus we set + * __GFP_RETRY_MAYFAIL to avoid such situations.
*/ - const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; + + const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | flags, numa_node); + area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + numa_node); if (area != NULL) return area; } - return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, - __builtin_return_address(0)); + return __vmalloc_node_flags_caller(size, numa_node, + GFP_KERNEL | __GFP_RETRY_MAYFAIL | + flags, __builtin_return_address(0)); } void bpf_map_area_free(void *area) @@ -463,7 +471,7 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; @@ -478,6 +486,22 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; + map->spin_lock_off = btf_find_spin_lock(btf, value_type); + + if (map_value_has_spin_lock(map)) { + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + return -ENOTSUPP; + if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > + map->value_size) { + WARN_ONCE(1, + "verifier bug spin_lock_off %d value_size %d\n", + map->spin_lock_off, map->value_size); + return -EFAULT; + } + } + if (map->ops->map_check_btf) ret = map->ops->map_check_btf(map, btf, key_type, value_type); @@ -542,6 +566,8 @@ static int map_create(union bpf_attr *attr) map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; + } else { + map->spin_lock_off = -EINVAL; } err = security_bpf_map_alloc(map); @@ -559,12 +585,12 @@ static int map_create(union bpf_attr *attr) err = 
bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. - * bpf_map_put() is needed because the above + * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ - bpf_map_put(map); + bpf_map_put_with_uref(map); return err; } @@ -664,7 +690,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags static int map_lookup_elem(union bpf_attr *attr) { @@ -680,6 +706,9 @@ static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; + if (attr->flags & ~BPF_F_LOCK) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -690,6 +719,12 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -745,7 +780,13 @@ static int map_lookup_elem(union bpf_attr *attr) err = -ENOENT; } else { err = 0; - memcpy(value, ptr, value_size); + if (attr->flags & BPF_F_LOCK) + /* lock 'ptr' and copy everything but lock */ + copy_map_value_locked(map, value, ptr, true); + else + copy_map_value(map, value, ptr); + /* mask lock, since value wasn't zero inited */ + check_and_init_map_lock(map, value); } rcu_read_unlock(); } @@ -808,6 +849,12 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -1219,6 +1266,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool 
do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); @@ -1244,24 +1292,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +static void bpf_prog_get_stats(const struct bpf_prog *prog, + struct bpf_prog_stats *stats) +{ + u64 nsecs = 0, cnt = 0; + int cpu; + + for_each_possible_cpu(cpu) { + const struct bpf_prog_stats *st; + unsigned int start; + u64 tnsecs, tcnt; + + st = per_cpu_ptr(prog->aux->stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&st->syncp); + tnsecs = st->nsecs; + tcnt = st->cnt; + } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + nsecs += tnsecs; + cnt += tcnt; + } + stats->nsecs = nsecs; + stats->cnt = cnt; +} + #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + struct bpf_prog_stats stats; + bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" - "prog_id:\t%u\n", + "prog_id:\t%u\n" + "run_time_ns:\t%llu\n" + "run_cnt:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, - prog->aux->id); + prog->aux->id, + stats.nsecs, + stats.cnt); } #endif @@ -1562,6 +1640,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); return err; free_used_maps: @@ -1986,7 +2065,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) fd = bpf_map_new_fd(map, f_flags); if (fd < 0) - bpf_map_put(map); + bpf_map_put_with_uref(map); return fd; } @@ -2083,6 +2162,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct 
bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info = {}; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2122,6 +2202,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (err) return err; + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56674a7c3778..09d5d972c9ff 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -212,7 +212,8 @@ struct bpf_call_arg_meta { int access_size; s64 msize_smax_value; u64 msize_umax_value; - int ptr_id; + int ref_obj_id; + int func_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -330,35 +331,38 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } -static bool reg_type_may_be_null(enum bpf_reg_type type) +static bool type_is_sk_pointer(enum bpf_reg_type type) { - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL; + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK; } -static bool type_is_refcounted(enum bpf_reg_type type) -{ - return type == PTR_TO_SOCKET; -} - -static bool type_is_refcounted_or_null(enum bpf_reg_type type) +static bool reg_type_may_be_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; + return type == PTR_TO_MAP_VALUE_OR_NULL || + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_SOCK_COMMON_OR_NULL || + type == PTR_TO_TCP_SOCK_OR_NULL; } -static bool reg_is_refcounted(const struct bpf_reg_state *reg) +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return type_is_refcounted(reg->type); + return reg->type == PTR_TO_MAP_VALUE && + map_value_has_spin_lock(reg->map_ptr); } -static bool reg_is_refcounted_or_null(const struct 
bpf_reg_state *reg) +static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type_is_refcounted_or_null(reg->type); + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_TCP_SOCK_OR_NULL; } -static bool arg_type_is_refcounted(enum bpf_arg_type type) +static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { - return type == ARG_PTR_TO_SOCKET; + return type == ARG_PTR_TO_SOCK_COMMON; } /* Determine whether the function releases some resources allocated by another @@ -370,6 +374,18 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } +static bool is_acquire_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp; +} + +static bool is_ptr_cast_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_tcp_sock || + func_id == BPF_FUNC_sk_fullsock; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -385,6 +401,10 @@ static const char * const reg_type_str[] = { [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", }; static char slot_type_char[] = { @@ -440,6 +460,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t)) + verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) @@ -611,13 +633,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) } /* release function corresponding to acquire_reference_state(). 
Idempotent. */ -static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +static int release_reference_state(struct bpf_func_state *state, int ptr_id) { int i, last_idx; - if (!ptr_id) - return -EFAULT; - last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { @@ -629,21 +648,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id) return 0; } } - return -EFAULT; -} - -/* variation on the above for cases where we expect that there must be an - * outstanding reference for the specified ptr_id. - */ -static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) -{ - struct bpf_func_state *state = cur_func(env); - int err; - - err = __release_reference_state(state, ptr_id); - if (WARN_ON_ONCE(err != 0)) - verbose(env, "verifier internal error: can't release reference\n"); - return err; + return -EINVAL; } static int transfer_reference_state(struct bpf_func_state *dst, @@ -712,6 +717,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; + dst_state->active_spin_lock = src->active_spin_lock; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1095,7 +1101,7 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; - if (BPF_CLASS(code) != BPF_JMP) + if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; @@ -1201,6 +1207,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return true; default: return false; @@ -1483,6 +1493,21 @@ static int check_map_access(struct 
bpf_verifier_env *env, u32 regno, if (err) verbose(env, "R%d max value is outside of the array range\n", regno); + + if (map_value_has_spin_lock(reg->map_ptr)) { + u32 lock = reg->map_ptr->spin_lock_off; + + /* if any part of struct bpf_spin_lock can be touched by + * load/store reject this program. + * To check that [x1, x2) overlaps with [y1, y2) + * it is sufficient to check x1 < y2 && y1 < x2. + */ + if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && + lock < reg->umax_value + off + size) { + verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); + return -EACCES; + } + } return err; } @@ -1617,12 +1642,14 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } -static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, enum bpf_access_type t) +static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, + u32 regno, int off, int size, + enum bpf_access_type t) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = &regs[regno]; - struct bpf_insn_access_aux info; + struct bpf_insn_access_aux info = {}; + bool valid; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1630,13 +1657,31 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - if (!bpf_sock_is_valid_access(off, size, t, &info)) { - verbose(env, "invalid bpf_sock access off=%d size=%d\n", - off, size); - return -EACCES; + switch (reg->type) { + case PTR_TO_SOCK_COMMON: + valid = bpf_sock_common_is_valid_access(off, size, t, &info); + break; + case PTR_TO_SOCKET: + valid = bpf_sock_is_valid_access(off, size, t, &info); + break; + case PTR_TO_TCP_SOCK: + valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); + break; + default: + valid = false; } - return 0; + + if (valid) { + env->insn_aux_data[insn_idx].ctx_field_size = + info.ctx_field_size; +
return 0; + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", + regno, reg_type_str[reg->type], off, size); + + return -EACCES; } static bool __is_pointer_value(bool allow_ptr_leaks, @@ -1662,8 +1707,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); - return reg->type == PTR_TO_CTX || - reg->type == PTR_TO_SOCKET; + return reg->type == PTR_TO_CTX; +} + +static bool is_sk_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return type_is_sk_pointer(reg->type); } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -1774,6 +1825,12 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCKET: pointer_desc = "sock "; break; + case PTR_TO_SOCK_COMMON: + pointer_desc = "sock_common "; + break; + case PTR_TO_TCP_SOCK: + pointer_desc = "tcp_sock "; + break; default: break; } @@ -1840,8 +1897,9 @@ continue_func: } frame++; if (frame >= MAX_CALL_FRAMES) { - WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); - return -EFAULT; + verbose(env, "the call stack of %d frames is too deep !\n", + frame); + return -E2BIG; } goto process_func; } @@ -1977,11 +2035,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. 
*/ - if (reg_type == SCALAR_VALUE) + if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, value_regno); - else + } else { mark_reg_known_zero(env, regs, value_regno); + if (reg_type_may_be_null(reg_type)) + regs[value_regno].id = ++env->id_gen; + } regs[value_regno].type = reg_type; } @@ -2027,12 +2088,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_SOCKET) { + } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "cannot write into socket\n"); + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); return -EACCES; } - err = check_sock_access(env, regno, off, size, t); + err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { @@ -2076,7 +2138,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg)) { + is_flow_key_reg(env, insn->dst_reg) || + is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -2192,6 +2255,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* Implementation details: + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * Two bpf_map_lookups (even with the same key) will have different reg->id. + * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after + * value_or_null->value transition, since the verifier only cares about + * the range of access to valid map value pointer and doesn't care about actual + * address of the map element. 
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps + * reg->id > 0 after value_or_null->value transition. By doing so + * two bpf_map_lookups will be considered two different pointers that + * point to different bpf_spin_locks. + * The verifier allows taking only one bpf_spin_lock at a time to avoid + * dead-locks. + * Since only one bpf_spin_lock is allowed the checks are simpler than + * reg_is_refcounted() logic. The verifier needs to remember only + * one spin_lock instead of array of acquired_refs. + * cur_state->active_spin_lock remembers which map value element got locked + * and clears it after bpf_spin_unlock. + */ +static int process_spin_lock(struct bpf_verifier_env *env, int regno, + bool is_lock) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno]; + struct bpf_verifier_state *cur = env->cur_state; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "R%d is not a pointer to map_value\n", regno); + return -EINVAL; + } + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset.
bpf_spin_lock has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, + "map '%s' has to have BTF in order to use bpf_spin_lock\n", + map->name); + return -EINVAL; + } + if (!map_value_has_spin_lock(map)) { + if (map->spin_lock_off == -E2BIG) + verbose(env, + "map '%s' has more than one 'struct bpf_spin_lock'\n", + map->name); + else if (map->spin_lock_off == -ENOENT) + verbose(env, + "map '%s' doesn't have 'struct bpf_spin_lock'\n", + map->name); + else + verbose(env, + "map '%s' is not a struct type or bpf_spin_lock is mangled\n", + map->name); + return -EINVAL; + } + if (map->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", + val + reg->off); + return -EINVAL; + } + if (is_lock) { + if (cur->active_spin_lock) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + cur->active_spin_lock = reg->id; + } else { + if (!cur->active_spin_lock) { + verbose(env, "bpf_spin_unlock without taking a lock\n"); + return -EINVAL; + } + if (cur->active_spin_lock != reg->id) { + verbose(env, "bpf_spin_unlock of different lock\n"); + return -EINVAL; + } + cur->active_spin_lock = 0; + } + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -2258,16 +2406,31 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; - } else if (arg_type == ARG_PTR_TO_SOCKET) { - expected_type = PTR_TO_SOCKET; - if (type != expected_type) + } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { + expected_type = PTR_TO_SOCK_COMMON; + /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ + if (!type_is_sk_pointer(type)) goto err_type; - if (meta->ptr_id || !reg->id) { - verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", - meta->ptr_id, reg->id); + if (reg->ref_obj_id) { + if (meta->ref_obj_id) { + verbose(env, 
"verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + } + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); return -EFAULT; } - meta->ptr_id = reg->id; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2573,32 +2736,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) return true; } -static bool check_refcount_ok(const struct bpf_func_proto *fn) +static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) { int count = 0; - if (arg_type_is_refcounted(fn->arg1_type)) + if (arg_type_may_be_refcounted(fn->arg1_type)) count++; - if (arg_type_is_refcounted(fn->arg2_type)) + if (arg_type_may_be_refcounted(fn->arg2_type)) count++; - if (arg_type_is_refcounted(fn->arg3_type)) + if (arg_type_may_be_refcounted(fn->arg3_type)) count++; - if (arg_type_is_refcounted(fn->arg4_type)) + if (arg_type_may_be_refcounted(fn->arg4_type)) count++; - if (arg_type_is_refcounted(fn->arg5_type)) + if (arg_type_may_be_refcounted(fn->arg5_type)) count++; + /* A reference acquiring function cannot acquire + * another refcounted ptr. + */ + if (is_acquire_function(func_id) && count) + return false; + /* We only support one arg being unreferenced at the moment, * which is sufficient for the helper functions we have right now. */ return count <= 1; } -static int check_func_proto(const struct bpf_func_proto *fn) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && - check_refcount_ok(fn) ? 
0 : -EINVAL; + check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2632,19 +2801,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, int id) + struct bpf_func_state *state, + int ref_obj_id) { struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].id == id) + if (regs[i].ref_obj_id == ref_obj_id) mark_reg_unknown(env, regs, i); bpf_for_each_spilled_reg(i, state, reg) { if (!reg) continue; - if (reg_is_refcounted(reg) && reg->id == id) + if (reg->ref_obj_id == ref_obj_id) __mark_reg_unknown(reg); } } @@ -2653,15 +2823,20 @@ static void release_reg_references(struct bpf_verifier_env *env, * resources. Identify all copies of the same pointer and clear the reference. */ static int release_reference(struct bpf_verifier_env *env, - struct bpf_call_arg_meta *meta) + int ref_obj_id) { struct bpf_verifier_state *vstate = env->cur_state; + int err; int i; + err = release_reference_state(cur_func(env), ref_obj_id); + if (err) + return err; + for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], meta->ptr_id); + release_reg_references(env, vstate->frame[i], ref_obj_id); - return release_reference_state(env, meta->ptr_id); + return 0; } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -2880,13 +3055,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + err = check_func_proto(fn, func_id); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } + meta.func_id = func_id; /* check args */ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) @@ -2925,9 +3101,12 @@ static int 
check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } } else if (is_release_function(func_id)) { - err = release_reference(env, &meta); - if (err) + err = release_reference(env, meta.ref_obj_id); + if (err) { + verbose(env, "func %s#%d reference has not been acquired before\n", + func_id_name(func_id), func_id); return err; + } } regs = cur_regs(env); @@ -2969,23 +3148,42 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].map_ptr = meta.map_ptr; if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + if (map_value_has_spin_lock(meta.map_ptr)) + regs[BPF_REG_0].id = ++env->id_gen; } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { - int id = acquire_reference_state(env, insn_idx); - if (id < 0) - return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = id; + if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } else { + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = ++env->id_gen; + } + } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } + if (is_ptr_cast_function(func_id)) + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + do_refine_retval_range(regs, fn->ret_type, func_id, &meta); err = check_map_func_compatibility(env, meta.map_ptr, func_id); @@ -3184,7 +3382,7 @@ do_sim: *dst_reg = *ptr_reg; } ret 
= push_stack(env, env->insn_idx + 1, env->insn_idx, true); - if (!ptr_is_dst_reg) + if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? -EFAULT : 0; } @@ -3239,6 +3437,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -3936,15 +4138,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static void __find_good_pkt_pointers(struct bpf_func_state *state, + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type, u16 new_range) +{ + struct bpf_reg_state *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + reg = &state->regs[i]; + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + } + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } +} + static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i, j; + int i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -4009,20 +4231,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == type && regs[i].id == dst_reg->id) - /* keep the maximum range already checked */ - regs[i].range = max(regs[i].range, new_range); - - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } - } + for (i = 0; i <= vstate->curframe; i++) + __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, + new_range); } /* compute branch direction of the expression "if (reg opcode val) goto target;" @@ -4031,11 +4242,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * 0 - branch will not be taken and fall-through to next insn * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, + bool is_jmp32) { + struct bpf_reg_state reg_lo; + s64 sval; + if (__is_pointer_value(false, reg)) return -1; + if (is_jmp32) { + reg_lo = *reg; + reg = &reg_lo; + /* For JMP32, only low 32 bits are compared, coerce_reg_to_size + * could truncate high bits and update umin/umax according to + * information of low bits. + */ + coerce_reg_to_size(reg, 4); + /* smin/smax need special handling. For example, after coerce, + * if smin_value is 0x00000000ffffffffLL, the value is -1 when + * used as operand to JMP32. It is a negative number from s32's + * point of view, while it is a positive number when seen as + * s64. The smin/smax are kept as s64, therefore, when used with + * JMP32, they need to be transformed into s32, then sign + * extended back to s64. + * + * Also, smin/smax were copied from umin/umax. 
If umin/umax has + * different sign bit, then min/max relationship doesn't + * maintain after casting into s32, for this case, set smin/smax + * to safest range. + */ + if ((reg->umax_value ^ reg->umin_value) & + (1ULL << 31)) { + reg->smin_value = S32_MIN; + reg->smax_value = S32_MAX; + } + reg->smin_value = (s64)(s32)reg->smin_value; + reg->smax_value = (s64)(s32)reg->smax_value; + + val = (u32)val; + sval = (s64)(s32)val; + } else { + sval = (s64)val; + } + switch (opcode) { case BPF_JEQ: if (tnum_is_const(reg->var_off)) @@ -4058,9 +4308,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSGT: - if (reg->smin_value > (s64)val) + if (reg->smin_value > sval) return 1; - else if (reg->smax_value < (s64)val) + else if (reg->smax_value < sval) return 0; break; case BPF_JLT: @@ -4070,9 +4320,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSLT: - if (reg->smax_value < (s64)val) + if (reg->smax_value < sval) return 1; - else if (reg->smin_value >= (s64)val) + else if (reg->smin_value >= sval) return 0; break; case BPF_JGE: @@ -4082,9 +4332,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSGE: - if (reg->smin_value >= (s64)val) + if (reg->smin_value >= sval) return 1; - else if (reg->smax_value < (s64)val) + else if (reg->smax_value < sval) return 0; break; case BPF_JLE: @@ -4094,9 +4344,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSLE: - if (reg->smax_value <= (s64)val) + if (reg->smax_value <= sval) return 1; - else if (reg->smin_value > (s64)val) + else if (reg->smin_value > sval) return 0; break; } @@ -4104,6 +4354,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return -1; } +/* Generate min value of the high 32-bit from TNUM info. 
*/ +static u64 gen_hi_min(struct tnum var) +{ + return var.value & ~0xffffffffULL; +} + +/* Generate max value of the high 32-bit from TNUM info. */ +static u64 gen_hi_max(struct tnum var) +{ + return (var.value | var.mask) & ~0xffffffffULL; +} + +/* Return true if VAL is compared with a s64 sign extended from s32, and they + * are with the same signedness. + */ +static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +{ + return ((s32)sval >= 0 && + reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || + ((s32)sval < 0 && + reg->smax_value <= 0 && reg->smin_value >= S32_MIN); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -4111,8 +4384,10 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) */ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { + s64 sval; + /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. @@ -4122,19 +4397,31 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. - */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + /* For BPF_JEQ, if this is false we know nothing Jon Snow, but + * if it is true we know the value for sure. Likewise for + * BPF_JNE. 
*/ - __mark_reg_known(false_reg, val); + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; + } case BPF_JSET: false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val)); @@ -4142,38 +4429,61 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val)); break; - case BPF_JGT: - false_reg->umax_value = min(false_reg->umax_value, val); - true_reg->umin_value = max(true_reg->umin_value, val + 1); - break; - case BPF_JSGT: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - break; - case BPF_JLT: - false_reg->umin_value = max(false_reg->umin_value, val); - true_reg->umax_value = min(true_reg->umax_value, val - 1); - break; - case BPF_JSLT: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); - break; case BPF_JGE: - false_reg->umax_value = min(false_reg->umax_value, val - 1); - true_reg->umin_value = max(true_reg->umin_value, val); + case BPF_JGT: + { + u64 false_umax = opcode == BPF_JGT ? val : val - 1; + u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSGE: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); + case BPF_JSGT: + { + s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSGT ? 
sval + 1 : sval; + + /* If the full s64 was not sign-extended from s32 then don't + * deduct further info. + */ + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } case BPF_JLE: - false_reg->umin_value = max(false_reg->umin_value, val + 1); - true_reg->umax_value = min(true_reg->umax_value, val); + case BPF_JLT: + { + u64 false_umin = opcode == BPF_JLT ? val : val + 1; + u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSLE: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); + case BPF_JSLT: + { + s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } default: break; } @@ -4196,24 +4506,34 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { + s64 sval; + if (__is_pointer_value(false, false_reg)) return; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. 
- */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; - */ - __mark_reg_known(false_reg, val); + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; + } case BPF_JSET: false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val)); @@ -4221,38 +4541,58 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val)); break; - case BPF_JGT: - true_reg->umax_value = min(true_reg->umax_value, val - 1); - false_reg->umin_value = max(false_reg->umin_value, val); - break; - case BPF_JSGT: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - break; - case BPF_JLT: - true_reg->umin_value = max(true_reg->umin_value, val + 1); - false_reg->umax_value = min(false_reg->umax_value, val); - break; - case BPF_JSLT: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); - break; case BPF_JGE: - true_reg->umax_value = min(true_reg->umax_value, val); - false_reg->umin_value = max(false_reg->umin_value, val + 1); + case BPF_JGT: + { + u64 false_umin = opcode == BPF_JGT ? val : val + 1; + u64 true_umax = opcode == BPF_JGT ? 
val - 1 : val; + + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSGE: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); + case BPF_JSGT: + { + s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } case BPF_JLE: - true_reg->umin_value = max(true_reg->umin_value, val); - false_reg->umax_value = min(false_reg->umax_value, val - 1); + case BPF_JLT: + { + u64 false_umax = opcode == BPF_JLT ? val : val - 1; + u64 true_umin = opcode == BPF_JLT ? val + 1 : val; + + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSLE: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); + case BPF_JSLT: + { + s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSLT ? 
sval + 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } default: break; } @@ -4343,17 +4683,46 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; + } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { + reg->type = PTR_TO_SOCK_COMMON; + } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { + reg->type = PTR_TO_TCP_SOCK; } - if (is_null || !reg_is_refcounted(reg)) { - /* We don't need id from this point onwards anymore, - * thus we should better reset it, so that state - * pruning has chances to take effect. + if (is_null) { + /* We don't need id and ref_obj_id from this point + * onwards anymore, thus we should better reset it, + * so that state pruning has chances to take effect. + */ + reg->id = 0; + reg->ref_obj_id = 0; + } else if (!reg_may_point_to_spin_lock(reg)) { + /* For not-NULL ptr, reg->ref_obj_id will be reset + * in release_reg_references(). + * + * reg->id is still used by spin_lock ptr. Other + * than spin_lock ptr type, reg->id can be reset. */ reg->id = 0; } } } +static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, + bool is_null) +{ + struct bpf_reg_state *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + mark_ptr_or_null_reg(state, reg, id, is_null); + } +} + /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. 
*/ @@ -4361,24 +4730,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *regs = state->regs; + u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i, j; - - if (reg_is_refcounted_or_null(&regs[regno]) && is_null) - __release_reference_state(state, id); + int i; - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, &regs[i], id, is_null); + if (ref_obj_id && ref_obj_id == id && is_null) + /* regs[regno] is in the " == NULL" branch. + * No one could have freed the reference state before + * doing the NULL check. + */ + WARN_ON_ONCE(release_reference_state(state, id)); - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } - } + for (i = 0; i <= vstate->curframe; i++) + __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, @@ -4390,6 +4755,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, if (BPF_SRC(insn->code) != BPF_X) return false; + /* Pointers are always 64-bit. */ + if (BPF_CLASS(insn->code) == BPF_JMP32) + return false; + switch (BPF_OP(insn->code)) { case BPF_JGT: if ((dst_reg->type == PTR_TO_PACKET && @@ -4482,16 +4851,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); + bool is_jmp32; int err; - if (opcode > BPF_JSLE) { - verbose(env, "invalid BPF_JMP opcode %x\n", opcode); + /* Only conditional jumps are expected to reach here. 
*/ + if (opcode == BPF_JA || opcode > BPF_JSLE) { + verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } @@ -4507,7 +4878,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } else { if (insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } } @@ -4518,9 +4889,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; dst_reg = &regs[insn->dst_reg]; + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; if (BPF_SRC(insn->code) == BPF_K) { - int pred = is_branch_taken(dst_reg, insn->imm, opcode); + int pred = is_branch_taken(dst_reg, insn->imm, opcode, + is_jmp32); if (pred == 1) { /* only follow the goto, ignore fall-through */ @@ -4548,30 +4921,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * comparable. */ if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = &regs[insn->src_reg]; + struct bpf_reg_state lo_reg0 = *dst_reg; + struct bpf_reg_state lo_reg1 = *src_reg; + struct bpf_reg_state *src_lo, *dst_lo; + + dst_lo = &lo_reg0; + src_lo = &lo_reg1; + coerce_reg_to_size(dst_lo, 4); + coerce_reg_to_size(src_lo, 4); + if (dst_reg->type == SCALAR_VALUE && - regs[insn->src_reg].type == SCALAR_VALUE) { - if (tnum_is_const(regs[insn->src_reg].var_off)) + src_reg->type == SCALAR_VALUE) { + if (tnum_is_const(src_reg->var_off) || + (is_jmp32 && tnum_is_const(src_lo->var_off))) reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, regs[insn->src_reg].var_off.value, - opcode); - else if (tnum_is_const(dst_reg->var_off)) + dst_reg, + is_jmp32 + ? 
src_lo->var_off.value + : src_reg->var_off.value, + opcode, is_jmp32); + else if (tnum_is_const(dst_reg->var_off) || + (is_jmp32 && tnum_is_const(dst_lo->var_off))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - &regs[insn->src_reg], - dst_reg->var_off.value, opcode); - else if (opcode == BPF_JEQ || opcode == BPF_JNE) + src_reg, + is_jmp32 + ? dst_lo->var_off.value + : dst_reg->var_off.value, + opcode, is_jmp32); + else if (!is_jmp32 && + (opcode == BPF_JEQ || opcode == BPF_JNE)) /* Comparing for equality, we can combine knowledge */ reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], - &regs[insn->src_reg], - &regs[insn->dst_reg], opcode); + src_reg, dst_reg, opcode); } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode); + dst_reg, insn->imm, opcode, is_jmp32); } - /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ - if (BPF_SRC(insn->code) == BPF_K && + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). + * NOTE: these optimizations below are related with pointer comparison + * which will never be JMP32. 
+ */ + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && reg_type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either @@ -4713,6 +5107,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + if (env->cur_state->active_spin_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); + return -EINVAL; + } + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -4900,7 +5299,8 @@ peek_stack: goto check_state; t = insn_stack[cur_stack - 1]; - if (BPF_CLASS(insns[t].code) == BPF_JMP) { + if (BPF_CLASS(insns[t].code) == BPF_JMP || + BPF_CLASS(insns[t].code) == BPF_JMP32) { u8 opcode = BPF_OP(insns[t].code); if (opcode == BPF_EXIT) { @@ -4997,13 +5397,14 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { - u32 i, nfuncs, urec_size, min_size, prev_offset; + u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; + u32 prev_offset = 0; int ret = 0; nfuncs = attr->func_info_cnt; @@ -5447,8 +5848,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. 
- * We don't care about the 'id' value, because nothing - * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + * 'id' is not compared, since it's only used for maps with + * bpf_spin_lock inside map element and in such cases if + * the rest of the prog is valid for one map element then + * it's valid for all map elements regardless of the key + * used in bpf_map_lookup() */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && @@ -5496,6 +5900,10 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -5651,6 +6059,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; + if (old->active_spin_lock != cur->active_spin_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -5684,15 +6095,17 @@ static int propagate_liveness(struct bpf_verifier_env *env, } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - /* We don't need to worry about FP liveness because it's read-only */ - for (i = 0; i < BPF_REG_FP; i++) { - if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) - continue; - if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], - &vparent->frame[vstate->curframe]->regs[i]); - if (err) - return err; + for (frame = 0; frame <= vstate->curframe; frame++) { + /* We don't need to worry about FP liveness, it's read-only */ + for (i = frame < vstate->curframe ? 
BPF_REG_6 : 0; i < BPF_REG_FP; i++) { + if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ) + continue; + if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, &vstate->frame[frame]->regs[i], + &vparent->frame[frame]->regs[i]); + if (err) + return err; + } } } @@ -5813,6 +6226,10 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_CTX: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return false; default: return true; @@ -6055,7 +6472,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -6063,11 +6480,18 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL) || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } + if (env->cur_state->active_spin_lock && + (insn->src_reg == BPF_PSEUDO_CALL || + insn->imm != BPF_FUNC_spin_unlock)) { + verbose(env, "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else @@ -6079,7 +6503,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -6091,11 +6516,17 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class 
== BPF_JMP32) { verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } + if (env->cur_state->active_spin_lock) { + verbose(env, "bpf_spin_unlock is missing\n"); + return -EINVAL; + } + if (state->curframe) { /* exit from nested function */ env->prev_insn_idx = env->insn_idx; @@ -6193,6 +6624,19 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } +static bool is_tracing_prog_type(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return true; + default: + return false; + } +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) @@ -6215,6 +6659,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if ((is_tracing_prog_type(prog->type) || + prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + map_value_has_spin_lock(map)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -6272,17 +6723,17 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; - if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose(env, - "unrecognized bpf_ld_imm64 insn\n"); + if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || + insn[1].imm != 0) { + verbose(env, "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } - f = fdget(insn->imm); + f = fdget(insn[0].imm); map = __bpf_map_get(f); if (IS_ERR(map)) { verbose(env, "fd %d is not pointing to valid bpf_map\n", - insn->imm); + insn[0].imm); return PTR_ERR(map); } @@ -6431,6 +6882,153 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +static int 
adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, + u32 off, u32 cnt) +{ + int i, j; + + /* find first prog starting at or after off (first to remove) */ + for (i = 0; i < env->subprog_cnt; i++) + if (env->subprog_info[i].start >= off) + break; + /* find first prog starting at or after off + cnt (first to stay) */ + for (j = i; j < env->subprog_cnt; j++) + if (env->subprog_info[j].start >= off + cnt) + break; + /* if j doesn't start exactly at off + cnt, we are just removing + * the front of previous prog + */ + if (env->subprog_info[j].start != off + cnt) + j--; + + if (j > i) { + struct bpf_prog_aux *aux = env->prog->aux; + int move; + + /* move fake 'exit' subprog as well */ + move = env->subprog_cnt + 1 - j; + + memmove(env->subprog_info + i, + env->subprog_info + j, + sizeof(*env->subprog_info) * move); + env->subprog_cnt -= j - i; + + /* remove func_info */ + if (aux->func_info) { + move = aux->func_info_cnt - j; + + memmove(aux->func_info + i, + aux->func_info + j, + sizeof(*aux->func_info) * move); + aux->func_info_cnt -= j - i; + /* func_info->insn_off is set after all code rewrites, + * in adjust_btf_func() - no need to adjust + */ + } + } else { + /* convert i from "first prog to remove" to "first to adjust" */ + if (env->subprog_info[i].start == off) + i++; + } + + /* update fake 'exit' subprog as well */ + for (; i <= env->subprog_cnt; i++) + env->subprog_info[i].start -= cnt; + + return 0; +} + +static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, + u32 cnt) +{ + struct bpf_prog *prog = env->prog; + u32 i, l_off, l_cnt, nr_linfo; + struct bpf_line_info *linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo) + return 0; + + linfo = prog->aux->linfo; + + /* find first line info to remove, count lines to be removed */ + for (i = 0; i < nr_linfo; i++) + if (linfo[i].insn_off >= off) + break; + + l_off = i; + l_cnt = 0; + for (; i < nr_linfo; i++) + if (linfo[i].insn_off < off + cnt) + l_cnt++; + else + break; + 
+ /* First live insn doesn't match first live linfo, it needs to "inherit" + * last removed linfo. prog is already modified, so prog->len == off + * means no live instructions after (tail of the program was removed). + */ + if (prog->len != off && l_cnt && + (i == nr_linfo || linfo[i].insn_off != off + cnt)) { + l_cnt--; + linfo[--i].insn_off = off + cnt; + } + + /* remove the line info which refer to the removed instructions */ + if (l_cnt) { + memmove(linfo + l_off, linfo + i, + sizeof(*linfo) * (nr_linfo - i)); + + prog->aux->nr_linfo -= l_cnt; + nr_linfo = prog->aux->nr_linfo; + } + + /* pull all linfo[i].insn_off >= off + cnt in by cnt */ + for (i = l_off; i < nr_linfo; i++) + linfo[i].insn_off -= cnt; + + /* fix up all subprogs (incl. 'exit') which start >= off */ + for (i = 0; i <= env->subprog_cnt; i++) + if (env->subprog_info[i].linfo_idx > l_off) { + /* program may have started in the removed region but + * may not be fully removed + */ + if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) + env->subprog_info[i].linfo_idx -= l_cnt; + else + env->subprog_info[i].linfo_idx = l_off; + } + + return 0; +} + +static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + unsigned int orig_prog_len = env->prog->len; + int err; + + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_remove_insns(env, off, cnt); + + err = bpf_remove_insns(env->prog, off, cnt); + if (err) + return err; + + err = adjust_subprog_starts_after_remove(env, off, cnt); + if (err) + return err; + + err = bpf_adj_linfo_after_remove(env, off, cnt); + if (err) + return err; + + memmove(aux_data + off, aux_data + off + cnt, + sizeof(*aux_data) * (orig_prog_len - off - cnt)); + + return 0; +} + /* The verifier does more data flow analysis than llvm and will not * explore branches that are dead at run time. Malicious programs can * have dead code too. 
Therefore replace all dead at-run-time code @@ -6457,6 +7055,91 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } +static bool insn_is_cond_jump(u8 code) +{ + u8 op; + + if (BPF_CLASS(code) == BPF_JMP32) + return true; + + if (BPF_CLASS(code) != BPF_JMP) + return false; + + op = BPF_OP(code); + return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; +} + +static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!insn_is_cond_jump(insn->code)) + continue; + + if (!aux_data[i + 1].seen) + ja.off = insn->off; + else if (!aux_data[i + 1 + insn->off].seen) + ja.off = 0; + else + continue; + + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_replace_insn(env, i, &ja); + + memcpy(insn, &ja, sizeof(ja)); + } +} + +static int opt_remove_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + int j; + + j = 0; + while (i + j < insn_cnt && !aux_data[i + j].seen) + j++; + if (!j) + continue; + + err = verifier_remove_insns(env, i, j); + if (err) + return err; + insn_cnt = env->prog->len; + } + + return 0; +} + +static int opt_remove_nops(struct bpf_verifier_env *env) +{ + const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + if (memcmp(&insn[i], &ja, sizeof(ja))) + continue; + + err = verifier_remove_insns(env, i, 1); + if (err) + return err; + insn_cnt--; + i--; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the 
underlying structure: * struct __sk_buff -> struct sk_buff @@ -6549,8 +7232,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = ops->convert_ctx_access; break; case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; default: continue; } @@ -6678,7 +7365,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + /* BPF_PROG_RUN doesn't call subprogs directly, + * hence main prog stats include the runtime of subprogs. + * subprogs don't have IDs and not reachable via prog_get_next_id + * func[i]->aux->stats will never be accessed and stays NULL + */ + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], @@ -6917,7 +7609,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) u32 off_reg; aux = &env->insn_aux_data[i + delta]; - if (!aux->alu_state) + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) continue; isneg = aux->alu_state & BPF_ALU_NEG_VALUE; @@ -7147,7 +7840,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, { struct bpf_verifier_env *env; struct bpf_verifier_log *log; - int ret = -EINVAL; + int i, len, ret = -EINVAL; + bool is_priv; /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) @@ -7161,12 +7855,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, return -ENOMEM; log = &env->log; + len = (*prog)->len; env->insn_aux_data = - vzalloc(array_size(sizeof(struct bpf_insn_aux_data), - (*prog)->len)); + vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; + for (i = 0; i < len; i++) + env->insn_aux_data[i].orig_idx = i; env->prog = 
*prog; env->ops = bpf_verifier_ops[env->prog->type]; @@ -7194,6 +7890,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; + is_priv = capable(CAP_SYS_ADMIN); + env->allow_ptr_leaks = is_priv; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; @@ -7211,8 +7910,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->explored_states) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = check_subprogs(env); if (ret < 0) goto skip_full_check; @@ -7242,8 +7939,17 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ - if (ret == 0) - sanitize_dead_code(env); + if (is_priv) { + if (ret == 0) + opt_hard_wire_dead_code_branches(env); + if (ret == 0) + ret = opt_remove_dead_code(env); + if (ret == 0) + ret = opt_remove_nops(env); + } else { + if (ret == 0) + sanitize_dead_code(env); + } if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..1444f3954d75 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); - /* - * fall through - v3 is otherwise equivalent to v2. - */ + /* fall through - v3 is otherwise equivalent to v2. 
*/ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; @@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); @@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); @@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +static bool ns_capable_common(struct user_namespace *ns, + int cap, + unsigned int opts) { int capable; @@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) BUG(); } - capable = audit ? security_capable(current_cred(), ns, cap) : - security_capable_noaudit(current_cred(), ns, cap); + capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; @@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) */ bool ns_capable(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, true); + return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); @@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable); */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, false); + return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** + * ns_capable_setid - Determine if the current task has a superior capability + * in effect, while signalling that this check is being done from within a + * setid syscall. 
+ * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable_setid(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, CAP_OPT_INSETID); +} +EXPORT_SYMBOL(ns_capable_setid); + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * @@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable); bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { + if (WARN_ON_ONCE(!cap_valid(cap))) return false; - if (security_capable(file->f_cred, ns, cap) == 0) + if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; @@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; + rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) - ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, + CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c950864016e2..30e39f3932ad 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -7,6 +7,7 @@ #include <linux/workqueue.h> #include <linux/list.h> #include <linux/refcount.h> +#include <linux/fs_context.h> #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; @@ -37,6 +38,31 @@ extern void __init enable_debug_cgroup(void); } while (0) /* + * The cgroup filesystem superblock creation/mount context. 
+ */ +struct cgroup_fs_context { + struct kernfs_fs_context kfc; + struct cgroup_root *root; + struct cgroup_namespace *ns; + unsigned int flags; /* CGRP_ROOT_* flags */ + + /* cgroup1 bits */ + bool cpuset_clone_children; + bool none; /* User explicitly requested empty subsystem */ + bool all_ss; /* Seen 'all' option */ + u16 subsys_mask; /* Selected subsystems */ + char *name; /* Hierarchy name */ + char *release_agent; /* Path for release notifications */ +}; + +static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct cgroup_fs_context, kfc); +} + +/* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other * direction, a css_set is naturally associated with multiple cgroups. @@ -117,16 +143,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; @@ -197,12 +213,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); void cgroup_free_root(struct cgroup_root *root); -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); +void init_cgroup_root(struct cgroup_fs_context *ctx); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace 
*ns); +int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); @@ -246,14 +260,15 @@ extern const struct proc_ns_operations cgroupns_operations; */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; +extern const struct fs_parameter_description cgroup1_fs_parameters; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); void cgroup1_check_for_release(struct cgroup *cgrp); -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns); +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); +int cgroup1_get_tree(struct fs_context *fc); +int cgroup1_reconfigure(struct fs_context *ctx); #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 583b969b0c0e..c126b34fd4ff 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -13,9 +13,12 @@ #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> +#include <linux/fs_parser.h> #include <trace/events/cgroup.h> +#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) + /* * pidlists linger the following amount before being destroyed. 
The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -906,172 +909,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo return 0; } -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); +enum cgroup1_param { + Opt_all, + Opt_clone_children, + Opt_cpuset_v2_mode, + Opt_name, + Opt_none, + Opt_noprefix, + Opt_release_agent, + Opt_xattr, +}; - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; +static const struct fs_parameter_spec cgroup1_param_specs[] = { + fsparam_flag ("all", Opt_all), + fsparam_flag ("clone_children", Opt_clone_children), + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + fsparam_string("name", Opt_name), + fsparam_flag ("none", Opt_none), + fsparam_flag ("noprefix", Opt_noprefix), + fsparam_string("release_agent", Opt_release_agent), + fsparam_flag ("xattr", Opt_xattr), + {} +}; - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "cpuset_v2_mode")) { - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - 
kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - - /* blocked by boot param? */ - if (cgroup_no_v1_named) - return -ENOENT; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; +const struct fs_parameter_description cgroup1_fs_parameters = { + .name = "cgroup1", + .specs = cgroup1_param_specs, +}; - continue; +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct cgroup_subsys *ss; + struct fs_parse_result result; + int opt, i; + + opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result); + if (opt == -ENOPARAM) { + if (strcmp(param->key, "source") == 0) { + fc->source = param->string; + param->string = NULL; + return 0; } - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name)) continue; - if (!cgroup_ssid_enabled(i)) + ctx->subsys_mask |= (1 << i); + return 0; + } + return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); + } + if (opt < 0) + return opt; + + switch (opt) { + case Opt_none: + /* Explicitly have no subsystems */ + ctx->none = true; + break; + case Opt_all: + ctx->all_ss = true; + break; + case Opt_noprefix: + ctx->flags |= CGRP_ROOT_NOPREFIX; + break; + case Opt_clone_children: + ctx->cpuset_clone_children = true; + break; + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + break; + case Opt_xattr: + ctx->flags |= CGRP_ROOT_XATTR; + break; + 
case Opt_release_agent: + /* Specifying two release agents is forbidden */ + if (ctx->release_agent) + return cg_invalf(fc, "cgroup1: release_agent respecified"); + ctx->release_agent = param->string; + param->string = NULL; + break; + case Opt_name: + /* blocked by boot param? */ + if (cgroup_no_v1_named) + return -ENOENT; + /* Can't specify an empty name */ + if (!param->size) + return cg_invalf(fc, "cgroup1: Empty name"); + if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) + return cg_invalf(fc, "cgroup1: Name too long"); + /* Must match [\w.-]+ */ + for (i = 0; i < param->size; i++) { + char c = param->string[i]; + if (isalnum(c)) continue; - if (cgroup1_ssid_disabled(i)) + if ((c == '.') || (c == '-') || (c == '_')) continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; + return cg_invalf(fc, "cgroup1: Invalid name"); } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + /* Specifying two names is forbidden */ + if (ctx->name) + return cg_invalf(fc, "cgroup1: name respecified"); + ctx->name = param->string; + param->string = NULL; + break; } + return 0; +} + +static int check_cgroupfs_options(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + u16 mask = U16_MAX; + u16 enabled = 0; + struct cgroup_subsys *ss; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + enabled |= 1 << i; + + ctx->subsys_mask &= enabled; /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' + * In absense of 'none', 'name=' or subsystem name options, + * let's default to 'all'. 
*/ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) - opts->subsys_mask |= (1 << i); + if (!ctx->subsys_mask && !ctx->none && !ctx->name) + ctx->all_ss = true; + + if (ctx->all_ss) { + /* Mutually exclusive option 'all' + subsystem name */ + if (ctx->subsys_mask) + return cg_invalf(fc, "cgroup1: subsys name conflicts with all"); + /* 'all' => select all the subsystems */ + ctx->subsys_mask = enabled; + } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + if (!ctx->subsys_mask && !ctx->name) + return cg_invalf(fc, "cgroup1: Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; + if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) + return cg_invalf(fc, "cgroup1: noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; + if (ctx->subsys_mask && ctx->none) + return cg_invalf(fc, "cgroup1: none used incorrectly"); return 0; } -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +int cgroup1_reconfigure(struct fs_context *fc) { - int ret = 0; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; + int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; - if (opts.subsys_mask != 
root->subsys_mask || opts.release_agent) + if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = ctx->subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); + if ((ctx->flags ^ root->flags) || + (ctx->name && strcmp(ctx->name, root->name))) { + cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", + ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1088,17 +1114,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - if (opts.release_agent) { + if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); + strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: - kfree(opts.release_agent); - kfree(opts.name); mutex_unlock(&cgroup_mutex); return ret; } @@ -1106,30 +1130,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, - .remount_fs = cgroup1_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns) +/* + * The guts of 
cgroup1 mount - find or create cgroup_root to use. + * Called with cgroup_mutex held; returns 0 on success, -E... on + * error and positive - in case when the candidate is busy dying. + * On success it stashes a reference to cgroup_root into given + * cgroup_fs_context; that reference is *NOT* counting towards the + * cgroup_root refcount. + */ +static int cgroup1_root_to_use(struct fs_context *fc) { - struct super_block *pinned_sb = NULL; - struct cgroup_sb_opts opts; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; - struct dentry *dentry; int i, ret; - bool new_root = false; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) - goto out_unlock; + return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may @@ -1139,16 +1163,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || + if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) + return 1; /* restart */ cgroup_put(&ss->root->cgrp); } @@ -1163,8 +1183,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. 
*/ - if (opts.name) { - if (strcmp(opts.name, root->name)) + if (ctx->name) { + if (strcmp(ctx->name, root->name)) continue; name_match = true; } @@ -1173,42 +1193,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + if ((ctx->subsys_mask || ctx->none) && + (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; - ret = -EBUSY; - goto out_unlock; + return -EBUSY; } - if (root->flags ^ opts.flags) + if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; + ctx->root = root; + return 0; } /* @@ -1216,62 +1212,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. 
*/ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } + if (!ctx->subsys_mask && !ctx->none) + return cg_invalf(fc, "cgroup1: No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } + if (ctx->ns != &init_cgroup_ns) + return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - new_root = true; + if (!root) + return -ENOMEM; - init_cgroup_root(root, &opts); + ctx->root = root; + init_cgroup_root(ctx); - ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); + ret = cgroup_setup_root(root, ctx->subsys_mask); if (ret) cgroup_free_root(root); + return ret; +} -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); +int cgroup1_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - if (ret) - return ERR_PTR(ret); + /* Check if the caller has permission to mount. */ + if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - /* - * There's a race window after we release cgroup_mutex and before - * allocating a superblock. Make sure a concurrent process won't - * be able to re-use the root during this window by delaying the - * initialization of root refcnt. - */ - if (new_root) { - mutex_lock(&cgroup_mutex); - percpu_ref_reinit(&root->cgrp.self.refcnt); - mutex_unlock(&cgroup_mutex); - } + ret = cgroup1_root_to_use(fc); + if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) + ret = 1; /* restart */ - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. 
- */ - if (pinned_sb) - deactivate_super(pinned_sb); + mutex_unlock(&cgroup_mutex); - return dentry; + if (!ret) + ret = cgroup_do_get_tree(fc); + + if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); + deactivate_locked_super(sb); + ret = 1; + } + + if (unlikely(ret > 0)) { + msleep(10); + return restart_syscall(); + } + return ret; } static int __init cgroup1_wq_init(void) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f31bd61c9466..3f2b4bde0f9c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> +#include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/psi.h> #include <net/sock.h> @@ -197,7 +198,7 @@ static u64 css_serial_nr_next = 1; */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; +static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ @@ -1772,26 +1773,37 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) -{ - char *token; +enum cgroup2_param { + Opt_nsdelegate, + nr__cgroup2_params +}; - *root_flags = 0; +static const struct fs_parameter_spec cgroup2_param_specs[] = { + fsparam_flag ("nsdelegate", Opt_nsdelegate), + {} +}; - if (!data || *data == '\0') - return 0; +static const struct fs_parameter_description cgroup2_fs_parameters = { + .name = "cgroup2", + .specs = cgroup2_param_specs, +}; - while ((token = strsep(&data, ",")) != NULL) { - if (!strcmp(token, "nsdelegate")) { - *root_flags |= CGRP_ROOT_NS_DELEGATE; - continue; - } +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = 
cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; - pr_err("cgroup2: unknown option \"%s\"\n", token); - return -EINVAL; - } + opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result); + if (opt < 0) + return opt; - return 0; + switch (opt) { + case Opt_nsdelegate: + ctx->flags |= CGRP_ROOT_NS_DELEGATE; + return 0; + } + return -EINVAL; } static void apply_cgroup_root_flags(unsigned int root_flags) @@ -1811,16 +1823,11 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root return 0; } -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +static int cgroup_reconfigure(struct fs_context *fc) { - unsigned int root_flags; - int ret; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) - return ret; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - apply_cgroup_root_flags(root_flags); + apply_cgroup_root_flags(ctx->flags); return 0; } @@ -1908,8 +1915,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_fs_context *ctx) { + struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); @@ -1918,16 +1926,16 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); - root->flags = opts->flags; - if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); - if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); - if (opts->cpuset_clone_children) + root->flags = ctx->flags; + if (ctx->release_agent) + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); + if (ctx->name) + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); + if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, 
&root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1944,7 +1952,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, - ref_flags, GFP_KERNEL); + 0, GFP_KERNEL); if (ret) goto out; @@ -2028,57 +2036,104 @@ out: return ret; } -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns) +int cgroup_do_get_tree(struct fs_context *fc) { - struct dentry *dentry; - bool new_sb; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); + ctx->kfc.root = ctx->root->kf_root; + if (fc->fs_type == &cgroup2_fs_type) + ctx->kfc.magic = CGROUP2_SUPER_MAGIC; + else + ctx->kfc.magic = CGROUP_SUPER_MAGIC; + ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. 
*/ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; + struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); - cgrp = cset_cgroup_from_root(ns->root_cset, root); + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); - dput(dentry); - dentry = nsdentry; + nsdentry = kernfs_node_dentry(cgrp->kn, sb); + dput(fc->root); + fc->root = nsdentry; + if (IS_ERR(nsdentry)) { + ret = PTR_ERR(nsdentry); + deactivate_locked_super(sb); + } } - if (IS_ERR(dentry) || !new_sb) - cgroup_put(&root->cgrp); + if (!ctx->kfc.new_sb_created) + cgroup_put(&ctx->root->cgrp); - return dentry; + return ret; } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +/* + * Destroy a cgroup filesystem context. + */ +static void cgroup_fs_context_free(struct fs_context *fc) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct dentry *dentry; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + + kfree(ctx->name); + kfree(ctx->release_agent); + put_cgroup_ns(ctx->ns); + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static int cgroup_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; - get_cgroup_ns(ns); + cgrp_dfl_visible = true; + cgroup_get_live(&cgrp_dfl_root.cgrp); + ctx->root = &cgrp_dfl_root; - /* Check if the caller has permission to mount. 
*/ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } + ret = cgroup_do_get_tree(fc); + if (!ret) + apply_cgroup_root_flags(ctx->flags); + return ret; +} + +static const struct fs_context_operations cgroup_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup2_parse_param, + .get_tree = cgroup_get_tree, + .reconfigure = cgroup_reconfigure, +}; + +static const struct fs_context_operations cgroup1_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup1_parse_param, + .get_tree = cgroup1_get_tree, + .reconfigure = cgroup1_reconfigure, +}; + +/* + * Initialise the cgroup filesystem creation/reconfiguration context. Notably, + * we select the namespace we're going to use. + */ +static int cgroup_init_fs_context(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx; + + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; /* * The first time anyone tries to mount a cgroup, enable the list @@ -2087,29 +2142,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - if (fs_type == &cgroup2_fs_type) { - unsigned int root_flags; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) { - put_cgroup_ns(ns); - return ERR_PTR(ret); - } - - cgrp_dfl_visible = true; - cgroup_get_live(&cgrp_dfl_root.cgrp); - - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, - CGROUP2_SUPER_MAGIC, ns); - if (!IS_ERR(dentry)) - apply_cgroup_root_flags(root_flags); - } else { - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, - CGROUP_SUPER_MAGIC, ns); - } - - put_cgroup_ns(ns); - return dentry; + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + fc->fs_private = &ctx->kfc; + if (fc->fs_type == &cgroup2_fs_type) + fc->ops = &cgroup_fs_context_ops; + else + fc->ops = &cgroup1_fs_context_ops; + if (fc->user_ns) + put_user_ns(fc->user_ns); + 
fc->user_ns = get_user_ns(ctx->ns->user_ns); + fc->global = true; + return 0; } static void cgroup_kill_sb(struct super_block *sb) @@ -2118,33 +2162,33 @@ static void cgroup_kill_sb(struct super_block *sb) struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* - * If @root doesn't have any mounts or children, start killing it. + * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. * * And don't kill the default root. */ - if (!list_empty(&root->cgrp.self.children) || - root == &cgrp_dfl_root) - cgroup_put(&root->cgrp); - else + if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); - + cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { - .name = "cgroup", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup1_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { - .name = "cgroup2", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup2", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup2_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, @@ -3533,6 +3577,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return 
seq_cft(seq)->seq_start(seq, ppos); @@ -3571,6 +3625,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3579,6 +3634,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, @@ -5267,7 +5323,6 @@ int cgroup_rmdir(struct kernfs_node *kn) static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, @@ -5313,7 +5368,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; + have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5334,11 +5389,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts; + static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; - init_cgroup_root(&cgrp_dfl_root, &opts); + ctx.root = &cgrp_dfl_root; + init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); @@ -5399,7 +5455,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); @@ -5749,16 +5805,19 @@ void cgroup_exit(struct task_struct 
*tsk) } while_each_subsys_mask(); } -void cgroup_free(struct task_struct *task) +void cgroup_release(struct task_struct *task) { - struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); + do_each_subsys_mask(ss, ssid, have_release_callback) { + ss->release(task); } while_each_subsys_mask(); +} +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); put_css_set(cset); } @@ -5996,7 +6055,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type, flags); + ret = __cgroup_bpf_detach(cgrp, prog, type); mutex_unlock(&cgroup_mutex); return ret; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 479743db6c37..6a1942ed781c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -39,6 +39,7 @@ #include <linux/memory.h> #include <linux/export.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> @@ -203,19 +204,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif - - /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, @@ -372,25 +360,52 @@ static inline bool is_in_v2_mode(void) * users. 
If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); +static int cpuset_get_tree(struct fs_context *fc) +{ + struct file_system_type *cgroup_fs; + struct fs_context *new_fc; + int ret; + + cgroup_fs = get_fs_type("cgroup"); + if (!cgroup_fs) + return -ENODEV; + + new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags); + if (IS_ERR(new_fc)) { + ret = PTR_ERR(new_fc); + } else { + static const char agent_path[] = "/sbin/cpuset_release_agent"; + ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "release_agent", + agent_path, sizeof(agent_path) - 1); + if (!ret) + ret = vfs_get_tree(new_fc); + if (!ret) { /* steal the result */ + fc->root = new_fc->root; + new_fc->root = NULL; + } + put_fs_context(new_fc); } + put_filesystem(cgroup_fs); return ret; } +static const struct fs_context_operations cpuset_fs_context_ops = { + .get_tree = cpuset_get_tree, +}; + +static int cpuset_init_fs_context(struct fs_context *fc) +{ + fc->ops = &cpuset_fs_context_ops; + return 0; +} + static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, + .name = "cpuset", + .init_fs_context = cpuset_init_fs_context, }; /* @@ -725,11 +740,10 @@ static inline int nr_cpusets(void) * Must be called with cpuset_mutex held. 
* * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. + * cp - cpuset pointer, used (together with pos_css) to perform a + * top-down scan of all cpusets. For our purposes, rebuilding + * the schedulers sched domains, we can ignore !is_sched_load_ + * balance cpusets. * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, @@ -760,7 +774,7 @@ static inline int nr_cpusets(void) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - struct cpuset *cp; /* scans q */ + struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 9829c67ebc0a..c9960baaa14f 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } -static void pids_free(struct task_struct *task) +static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .free = pids_free, + .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index d3bbb757ee49..1d75ae7f1cb7 100644 --- a/kernel/cgroup/rdma.c +++ 
b/kernel/cgroup/rdma.c @@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge); * If IB stack wish a device to participate in rdma cgroup resource * tracking, it must invoke this API to register with rdma cgroup before * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. */ -int rdmacg_register_device(struct rdmacg_device *device) +void rdmacg_register_device(struct rdmacg_device *device) { INIT_LIST_HEAD(&device->dev_node); INIT_LIST_HEAD(&device->rpools); @@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device) mutex_lock(&rdmacg_mutex); list_add_tail(&device->dev_node, &rdmacg_devices); mutex_unlock(&rdmacg_mutex); - return 0; } EXPORT_SYMBOL(rdmacg_register_device); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d503d1a9007c..bb95a35e8c2d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -87,7 +87,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; if (pos == root) return NULL; @@ -115,8 +114,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - parent = cgroup_parent(pos); - if (parent && rstatc->updated_next) { + if (rstatc->updated_next) { + struct cgroup *parent = cgroup_parent(pos); struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); struct cgroup_rstat_cpu *nrstatc; struct cgroup **nextp; @@ -140,9 +139,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * updated stat. 
*/ smp_mb(); + + return pos; } - return pos; + /* only happens for @root */ + return NULL; } /* see cgroup_rstat_flush() */ diff --git a/kernel/compat.c b/kernel/compat.c index f01affa17e22..d8a36c6ad7c9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -20,7 +20,6 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/security.h> -#include <linux/timex.h> #include <linux/export.h> #include <linux/migrate.h> #include <linux/posix-timers.h> @@ -30,69 +29,6 @@ #include <linux/uaccess.h> -int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) -{ - struct compat_timex tx32; - - memset(txc, 0, sizeof(struct timex)); - if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) - return -EFAULT; - - txc->modes = tx32.modes; - txc->offset = tx32.offset; - txc->freq = tx32.freq; - txc->maxerror = tx32.maxerror; - txc->esterror = tx32.esterror; - txc->status = tx32.status; - txc->constant = tx32.constant; - txc->precision = tx32.precision; - txc->tolerance = tx32.tolerance; - txc->time.tv_sec = tx32.time.tv_sec; - txc->time.tv_usec = tx32.time.tv_usec; - txc->tick = tx32.tick; - txc->ppsfreq = tx32.ppsfreq; - txc->jitter = tx32.jitter; - txc->shift = tx32.shift; - txc->stabil = tx32.stabil; - txc->jitcnt = tx32.jitcnt; - txc->calcnt = tx32.calcnt; - txc->errcnt = tx32.errcnt; - txc->stbcnt = tx32.stbcnt; - - return 0; -} - -int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) -{ - struct compat_timex tx32; - - memset(&tx32, 0, sizeof(struct compat_timex)); - tx32.modes = txc->modes; - tx32.offset = txc->offset; - tx32.freq = txc->freq; - tx32.maxerror = txc->maxerror; - tx32.esterror = txc->esterror; - tx32.status = txc->status; - tx32.constant = txc->constant; - tx32.precision = txc->precision; - tx32.tolerance = txc->tolerance; - tx32.time.tv_sec = txc->time.tv_sec; - tx32.time.tv_usec = txc->time.tv_usec; - tx32.tick = txc->tick; - tx32.ppsfreq = txc->ppsfreq; - tx32.jitter = txc->jitter; - 
tx32.shift = txc->shift; - tx32.stabil = txc->stabil; - tx32.jitcnt = txc->jitcnt; - tx32.calcnt = txc->calcnt; - tx32.errcnt = txc->errcnt; - tx32.stbcnt = txc->stbcnt; - tx32.tai = txc->tai; - if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) - return -EFAULT; - return 0; -} - static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(ctv, sizeof(*ctv)) || diff --git a/kernel/configs.c b/kernel/configs.c index 2df132b20217..b062425ccf8d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -30,37 +30,35 @@ #include <linux/init.h> #include <linux/uaccess.h> -/**************************************************/ -/* the actual current config file */ - /* - * Define kernel_config_data and kernel_config_data_size, which contains the - * wrapped and compressed configuration file. The file is first compressed - * with gzip and then bounded by two eight byte magic numbers to allow - * extraction from a binary kernel image: - * - * IKCFG_ST - * <image> - * IKCFG_ED + * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from + * a binary kernel image or a module. See scripts/extract-ikconfig. 
*/ -#define MAGIC_START "IKCFG_ST" -#define MAGIC_END "IKCFG_ED" -#include "config_data.h" - - -#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) -#define kernel_config_data_size \ - (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) +asm ( +" .pushsection .rodata, \"a\" \n" +" .ascii \"IKCFG_ST\" \n" +" .global kernel_config_data \n" +"kernel_config_data: \n" +" .incbin \"kernel/config_data.gz\" \n" +" .global kernel_config_data_end \n" +"kernel_config_data_end: \n" +" .ascii \"IKCFG_ED\" \n" +" .popsection \n" +); #ifdef CONFIG_IKCONFIG_PROC +extern char kernel_config_data; +extern char kernel_config_data_end; + static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { return simple_read_from_buffer(buf, len, offset, - kernel_config_data + MAGIC_SIZE, - kernel_config_data_size); + &kernel_config_data, + &kernel_config_data_end - + &kernel_config_data); } static const struct file_operations ikconfig_file_ops = { @@ -79,7 +77,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - proc_set_size(entry, kernel_config_data_size); + proc_set_size(entry, &kernel_config_data_end - &kernel_config_data); return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index d1c6d152da89..f2ef10460698 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -9,6 +9,7 @@ #include <linux/notifier.h> #include <linux/sched/signal.h> #include <linux/sched/hotplug.h> +#include <linux/sched/isolation.h> #include <linux/sched/task.h> #include <linux/sched/smt.h> #include <linux/unistd.h> @@ -313,6 +314,15 @@ void cpus_write_unlock(void) void lockdep_assert_cpus_held(void) { + /* + * We can't have hotplug operations before userspace starts running, + * and some init codepaths will knowingly not take the hotplug lock. + * This is all valid, so mute lockdep until it makes sense to report + * unheld locks. 
+ */ + if (system_state < SYSTEM_RUNNING) + return; + percpu_rwsem_assert_held(&cpu_hotplug_lock); } @@ -555,6 +565,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } +static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) +{ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + /* + * When CPU hotplug is disabled, then taking the CPU down is not + * possible because takedown_cpu() and the architecture and + * subsystem specific mechanisms are not available. So the CPU + * which would be completely unplugged again needs to stay around + * in the current state. + */ + return st->state <= CPUHP_BRINGUP_CPU; +} + static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { @@ -565,8 +589,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { - st->target = prev_state; - undo_cpu_up(cpu, st); + if (can_rollback_cpu(st)) { + st->target = prev_state; + undo_cpu_up(cpu, st); + } break; } } @@ -835,6 +861,8 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); + /* Remove CPU from timer broadcasting */ + tick_offline_cpu(cpu); /* Park the stopper thread */ stop_machine_park(cpu); return 0; @@ -1174,8 +1202,15 @@ int freeze_secondary_cpus(int primary) int cpu, error = 0; cpu_maps_update_begin(); - if (!cpu_online(primary)) + if (primary == -1) { primary = cpumask_first(cpu_online_mask); + if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) + primary = housekeeping_any_cpu(HK_FLAG_TIMER); + } else { + if (!cpu_online(primary)) + primary = cpumask_first(cpu_online_mask); + } + /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -2008,19 +2043,6 @@ static const struct attribute_group 
cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static const char *smt_states[] = { - [CPU_SMT_ENABLED] = "on", - [CPU_SMT_DISABLED] = "off", - [CPU_SMT_FORCE_DISABLED] = "forceoff", - [CPU_SMT_NOT_SUPPORTED] = "notsupported", -}; - -static ssize_t -show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); -} - static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); @@ -2091,9 +2113,10 @@ static int cpuhp_smt_enable(void) return ret; } + static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { int ctrlval, ret; @@ -2131,14 +2154,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr, unlock_device_hotplug(); return ret ? ret : count; } + +#else /* !CONFIG_HOTPLUG_SMT */ +static ssize_t +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return -ENODEV; +} +#endif /* CONFIG_HOTPLUG_SMT */ + +static const char *smt_states[] = { + [CPU_SMT_ENABLED] = "on", + [CPU_SMT_DISABLED] = "off", + [CPU_SMT_FORCE_DISABLED] = "forceoff", + [CPU_SMT_NOT_SUPPORTED] = "notsupported", + [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ + const char *state = smt_states[cpu_smt_control]; + + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return __store_smt_control(dev, attr, buf, count); +} static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); static ssize_t show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) { - bool 
active = topology_max_smt_threads() > 1; - - return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); + return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); } static DEVICE_ATTR(active, 0444, show_smt_active, NULL); @@ -2154,21 +2207,17 @@ static const struct attribute_group cpuhp_smt_attr_group = { NULL }; -static int __init cpu_smt_state_init(void) +static int __init cpu_smt_sysfs_init(void) { return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -#else -static inline int cpu_smt_state_init(void) { return 0; } -#endif - static int __init cpuhp_sysfs_init(void) { int cpu, ret; - ret = cpu_smt_state_init(); + ret = cpu_smt_sysfs_init(); if (ret) return ret; @@ -2189,7 +2238,7 @@ static int __init cpuhp_sysfs_init(void) return 0; } device_initcall(cpuhp_sysfs_init); -#endif +#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */ /* * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -2279,3 +2328,18 @@ void __init boot_cpu_hotplug_init(void) #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } + +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; + +static int __init mitigations_parse_cmdline(char *arg) +{ + if (!strcmp(arg, "off")) + cpu_mitigations = CPU_MITIGATIONS_OFF; + else if (!strcmp(arg, "auto")) + cpu_mitigations = CPU_MITIGATIONS_AUTO; + else if (!strcmp(arg, "auto,nosmt")) + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + + return 0; +} +early_param("mitigations", mitigations_parse_cmdline); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); diff --git a/kernel/cred.c 
b/kernel/cred.c index 21f4a97085b4..45d77284aed0 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -760,19 +760,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; -#ifdef CONFIG_SECURITY_SELINUX - /* - * cred->security == NULL if security_cred_alloc_blank() or - * security_prepare_creds() returned an error. - */ - if (selinux_is_enabled() && cred->security) { - if ((unsigned long) cred->security < PAGE_SIZE) - return true; - if ((*(u32 *)cred->security & 0xffffff00) == - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) - return true; - } -#endif return false; } EXPORT_SYMBOL(creds_are_invalid); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index ca88b867e7fe..a06ba3013b3b 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -16,7 +16,16 @@ config ARCH_DMA_ADDR_T_64BIT config ARCH_HAS_DMA_COHERENCE_H bool -config HAVE_GENERIC_DMA_COHERENT +config ARCH_HAS_DMA_SET_MASK + bool + +config DMA_DECLARE_COHERENT + bool + +config ARCH_HAS_SETUP_DMA_OPS + bool + +config ARCH_HAS_TEARDOWN_DMA_OPS bool config ARCH_HAS_SYNC_DMA_FOR_DEVICE @@ -53,3 +62,116 @@ config DMA_REMAP config DMA_DIRECT_REMAP bool select DMA_REMAP + +config DMA_CMA + bool "DMA Contiguous Memory Allocator" + depends on HAVE_DMA_CONTIGUOUS && CMA + help + This enables the Contiguous Memory Allocator which allows drivers + to allocate big physically-contiguous blocks of memory for use with + hardware components that do not support I/O map nor scatter-gather. + + You can disable CMA by specifying "cma=0" on the kernel's command + line. + + For more information see <include/linux/dma-contiguous.h>. + If unsure, say "n". + +if DMA_CMA +comment "Default contiguous memory area size:" + +config CMA_SIZE_MBYTES + int "Size in Mega Bytes" + depends on !CMA_SIZE_SEL_PERCENTAGE + default 0 if X86 + default 16 + help + Defines the size (in MiB) of the default memory area for Contiguous + Memory Allocator. 
If the size of 0 is selected, CMA is disabled by + default, but it can be enabled by passing cma=size[MG] to the kernel. + + +config CMA_SIZE_PERCENTAGE + int "Percentage of total memory" + depends on !CMA_SIZE_SEL_MBYTES + default 0 if X86 + default 10 + help + Defines the size of the default memory area for Contiguous Memory + Allocator as a percentage of the total memory in the system. + If 0 percent is selected, CMA is disabled by default, but it can be + enabled by passing cma=size[MG] to the kernel. + +choice + prompt "Selected region size" + default CMA_SIZE_SEL_MBYTES + +config CMA_SIZE_SEL_MBYTES + bool "Use mega bytes value only" + +config CMA_SIZE_SEL_PERCENTAGE + bool "Use percentage value only" + +config CMA_SIZE_SEL_MIN + bool "Use lower value (minimum)" + +config CMA_SIZE_SEL_MAX + bool "Use higher value (maximum)" + +endchoice + +config CMA_ALIGNMENT + int "Maximum PAGE_SIZE order of alignment for contiguous buffers" + range 4 12 + default 8 + help + DMA mapping framework by default aligns all buffers to the smallest + PAGE_SIZE order which is greater than or equal to the requested buffer + size. This works well for buffers up to a few hundreds kilobytes, but + for larger buffers it just a memory waste. With this parameter you can + specify the maximum PAGE_SIZE order for contiguous buffers. Larger + buffers will be aligned only to this specified order. The order is + expressed as a power of two multiplied by the PAGE_SIZE. + + For example, if your system defaults to 4KiB pages, the order value + of 8 means that the buffers will be aligned up to 1MiB only. + + If unsure, leave the default value "8". + +endif + +config DMA_API_DEBUG + bool "Enable debugging of DMA-API usage" + select NEED_DMA_MAP_STATE + help + Enable this option to debug the use of the DMA API by device drivers. + With this option you will be able to detect common bugs in device + drivers like double-freeing of DMA mappings or freeing mappings that + were never allocated. 
+ + This also attempts to catch cases where a page owned by DMA is + accessed by the cpu in a way that could cause data corruption. For + example, this enables cow_user_page() to check that the source page is + not undergoing DMA. + + This option causes a performance degradation. Use only if you want to + debug device drivers and dma interactions. + + If unsure, say N. + +config DMA_API_DEBUG_SG + bool "Debug DMA scatter-gather usage" + default y + depends on DMA_API_DEBUG + help + Perform extra checking that callers of dma_map_sg() have respected the + appropriate segment length/boundary limits for the given device when + preparing DMA scatterlists. + + This is particularly likely to have been overlooked in cases where the + dma_map_sg() API is used for general bulk mapping of pages rather than + preparing literal scatter-gather descriptors, where there is a risk of + unexpected behaviour from DMA API implementations if the scatterlist + is technically out-of-spec. + + If unsure, say N. 
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 72ff6e46aa86..d237cf3dc181 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o +obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 66f0fb7e9a3a..29fd6590dc1e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -14,7 +14,6 @@ struct dma_coherent_mem { dma_addr_t device_base; unsigned long pfn_base; int size; - int flags; unsigned long *bitmap; spinlock_t spinlock; bool use_dev_dma_pfn_offset; @@ -38,12 +37,12 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, return mem->device_base; } -static int dma_init_coherent_memory( - phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, - struct dma_coherent_mem **mem) +static int dma_init_coherent_memory(phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, + struct dma_coherent_mem **mem) { struct dma_coherent_mem *dma_mem = NULL; - void __iomem *mem_base = NULL; + void *mem_base = NULL; int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); int ret; @@ -73,7 +72,6 @@ static int dma_init_coherent_memory( dma_mem->device_base = device_addr; dma_mem->pfn_base = PFN_DOWN(phys_addr); dma_mem->size = pages; - dma_mem->flags = flags; spin_lock_init(&dma_mem->spinlock); *mem = dma_mem; @@ -110,12 +108,12 @@ static int dma_assign_coherent_memory(struct device *dev, } int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) + dma_addr_t device_addr, size_t size) { struct dma_coherent_mem *mem; int ret; - ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + ret 
= dma_init_coherent_memory(phys_addr, device_addr, size, &mem); if (ret) return ret; @@ -137,29 +135,6 @@ void dma_release_declared_memory(struct device *dev) } EXPORT_SYMBOL(dma_release_declared_memory); -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - unsigned long flags; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&mem->spinlock, flags); - pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - spin_unlock_irqrestore(&mem->spinlock, flags); - - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) { @@ -213,15 +188,7 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, return 0; *ret = __dma_alloc_from_coherent(mem, size, dma_handle); - if (*ret) - return 1; - - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. 
- */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; + return 1; } void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) @@ -350,8 +317,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) if (!mem) { ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); + rmem->size, &mem); if (ret) { pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 23cf5361bcf1..badd77670d00 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -89,8 +89,8 @@ struct dma_debug_entry { int sg_mapped_ents; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE - struct stack_trace stacktrace; - unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; + unsigned int stack_len; + unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif }; @@ -134,17 +134,6 @@ static u32 nr_total_entries; /* number of preallocated entries requested by kernel cmdline */ static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; -/* debugfs dentry's for the stuff above */ -static struct dentry *dma_debug_dent __read_mostly; -static struct dentry *global_disable_dent __read_mostly; -static struct dentry *error_count_dent __read_mostly; -static struct dentry *show_all_errors_dent __read_mostly; -static struct dentry *show_num_errors_dent __read_mostly; -static struct dentry *num_free_entries_dent __read_mostly; -static struct dentry *min_free_entries_dent __read_mostly; -static struct dentry *nr_total_entries_dent __read_mostly; -static struct dentry *filter_dent __read_mostly; - /* per-driver filter related state */ #define NAME_MAX_LEN 64 @@ -185,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) #ifdef CONFIG_STACKTRACE if (entry) { pr_warning("Mapped at:\n"); - print_stack_trace(&entry->stacktrace, 0); + stack_trace_print(entry->stack_entries, 
entry->stack_len, 0); } #endif } @@ -715,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_unlock_irqrestore(&free_entries_lock, flags); #ifdef CONFIG_STACKTRACE - entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; - entry->stacktrace.entries = entry->st_entries; - entry->stacktrace.skip = 2; - save_stack_trace(&entry->stacktrace); + entry->stack_len = stack_trace_save(entry->stack_entries, + ARRAY_SIZE(entry->stack_entries), + 1); #endif - return entry; } @@ -840,66 +827,46 @@ static const struct file_operations filter_fops = { .llseek = default_llseek, }; -static int dma_debug_fs_init(void) +static int dump_show(struct seq_file *seq, void *v) { - dma_debug_dent = debugfs_create_dir("dma-api", NULL); - if (!dma_debug_dent) { - pr_err("can not create debugfs directory\n"); - return -ENOMEM; - } + int idx; - global_disable_dent = debugfs_create_bool("disabled", 0444, - dma_debug_dent, - &global_disable); - if (!global_disable_dent) - goto out_err; - - error_count_dent = debugfs_create_u32("error_count", 0444, - dma_debug_dent, &error_count); - if (!error_count_dent) - goto out_err; - - show_all_errors_dent = debugfs_create_u32("all_errors", 0644, - dma_debug_dent, - &show_all_errors); - if (!show_all_errors_dent) - goto out_err; - - show_num_errors_dent = debugfs_create_u32("num_errors", 0644, - dma_debug_dent, - &show_num_errors); - if (!show_num_errors_dent) - goto out_err; - - num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, - dma_debug_dent, - &num_free_entries); - if (!num_free_entries_dent) - goto out_err; - - min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, - dma_debug_dent, - &min_free_entries); - if (!min_free_entries_dent) - goto out_err; - - nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444, - dma_debug_dent, - &nr_total_entries); - if (!nr_total_entries_dent) - goto out_err; - - filter_dent = debugfs_create_file("driver_filter", 0644, - dma_debug_dent, NULL, 
&filter_fops); - if (!filter_dent) - goto out_err; + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + spin_lock_irqsave(&bucket->lock, flags); + list_for_each_entry(entry, &bucket->list, list) { + seq_printf(seq, + "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n", + dev_name(entry->dev), + dev_driver_string(entry->dev), + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + spin_unlock_irqrestore(&bucket->lock, flags); + } return 0; +} +DEFINE_SHOW_ATTRIBUTE(dump); -out_err: - debugfs_remove_recursive(dma_debug_dent); - - return -ENOMEM; +static void dma_debug_fs_init(void) +{ + struct dentry *dentry = debugfs_create_dir("dma-api", NULL); + + debugfs_create_bool("disabled", 0444, dentry, &global_disable); + debugfs_create_u32("error_count", 0444, dentry, &error_count); + debugfs_create_u32("all_errors", 0644, dentry, &show_all_errors); + debugfs_create_u32("num_errors", 0644, dentry, &show_num_errors); + debugfs_create_u32("num_free_entries", 0444, dentry, &num_free_entries); + debugfs_create_u32("min_free_entries", 0444, dentry, &min_free_entries); + debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries); + debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops); + debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops); } static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) @@ -985,12 +952,7 @@ static int dma_debug_init(void) spin_lock_init(&dma_entry_hash[i].lock); } - if (dma_debug_fs_init() != 0) { - pr_err("error creating debugfs entries - disabling\n"); - global_disable = true; - - return 0; - } + dma_debug_fs_init(); nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); for (i = 0; i < nr_pages; ++i) diff --git a/kernel/dma/direct.c 
b/kernel/dma/direct.c index 355d16acee6d..fcdb23e8d2fc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -132,8 +132,7 @@ again: goto again; } - if (IS_ENABLED(CONFIG_ZONE_DMA) && - phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { + if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -356,6 +355,20 @@ out_unmap: } EXPORT_SYMBOL(dma_direct_map_sg); +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t dma_addr = paddr; + + if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + report_addr(dev, dma_addr, size); + return DMA_MAPPING_ERROR; + } + + return dma_addr; +} +EXPORT_SYMBOL(dma_direct_map_resource); + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by @@ -380,3 +393,14 @@ int dma_direct_supported(struct device *dev, u64 mask) */ return mask >= __phys_to_dma(dev, min_mask); } + +size_t dma_direct_max_mapping_size(struct device *dev) +{ + size_t size = SIZE_MAX; + + /* If SWIOTLB is active, use its maximum mapping size */ + if (is_swiotlb_active()) + size = swiotlb_max_mapping_size(dev); + + return size; +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a11006b6d8e8..c000906348c9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -207,7 +207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_attrs); -#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK static u64 dma_default_get_required_mask(struct device *dev) { u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); @@ -238,7 +237,6 @@ u64 dma_get_required_mask(struct device *dev) return dma_default_get_required_mask(dev); } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#endif #ifndef arch_dma_alloc_attrs #define arch_dma_alloc_attrs(dev) (true) @@ -318,18 +316,23 @@ int 
dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -#ifndef HAVE_ARCH_DMA_SET_MASK +#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK +void arch_dma_set_mask(struct device *dev, u64 mask); +#else +#define arch_dma_set_mask(dev, mask) do { } while (0) +#endif + int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + arch_dma_set_mask(dev, mask); dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_mask); -#endif #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) @@ -357,3 +360,17 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, ops->cache_sync(dev, vaddr, size, dir); } EXPORT_SYMBOL(dma_cache_sync); + +size_t dma_max_mapping_size(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + size_t size = SIZE_MAX; + + if (dma_is_direct(ops)) + size = dma_direct_max_mapping_size(dev); + else if (ops && ops->max_mapping_size) + size = ops->max_mapping_size(dev); + + return size; +} +EXPORT_SYMBOL_GPL(dma_max_mapping_size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 38d57218809c..6f7619c1f877 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -199,6 +199,7 @@ void __init swiotlb_update_mem_attributes(void) int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { unsigned long i, bytes; + size_t alloc_size; bytes = nslabs << IO_TLB_SHIFT; @@ -211,12 +212,18 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. 
*/ - io_tlb_list = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), - PAGE_SIZE); - io_tlb_orig_addr = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), - PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int)); + io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_list) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)); + io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_addr) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; @@ -249,7 +256,7 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; @@ -670,16 +677,18 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, return true; } -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) +size_t swiotlb_max_mapping_size(struct device *dev) +{ + return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; +} + +bool is_swiotlb_active(void) { - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + /* + * When SWIOTLB is initialized, even if io_tlb_start points to physical + * address zero, io_tlb_end surely doesn't. 
+ */ + return io_tlb_end != 0; } #ifdef CONFIG_DEBUG_FS diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 24a77c34e9ad..c2b41a263166 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/perf_event.h> diff --git a/kernel/events/core.c b/kernel/events/core.c index e5ede6918050..abbd4b3b96c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/fs.h> @@ -385,6 +384,8 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -1171,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx) static void get_ctx(struct perf_event_context *ctx) { - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -1185,7 +1186,7 @@ static void free_ctx(struct rcu_head *head) static void put_ctx(struct perf_event_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) @@ -1254,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock @@ -1267,7 +1269,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } @@ -1400,7 +1402,7 @@ retry: } if (ctx->task == TASK_TOMBSTONE || - !atomic_inc_not_zero(&ctx->refcount)) { + !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { @@ -2007,8 +2009,8 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (event->pending_disable) { - event->pending_disable = 0; + if (READ_ONCE(event->pending_disable) >= 0) { + WRITE_ONCE(event->pending_disable, -1); state = 
PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); @@ -2196,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - event->pending_disable = 1; + WRITE_ONCE(event->pending_disable, smp_processor_id()); + /* can fail, see perf_pending_event_disable() */ irq_work_queue(&event->pending); } @@ -2475,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_pmu_enable(cpuctx->ctx.pmu); } +void perf_pmu_resched(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + perf_ctx_lock(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + perf_ctx_unlock(cpuctx, task_ctx); +} + /* * Cross CPU call to install and enable a performance event * @@ -2797,7 +2810,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in - * event::addr_filters_offs array and bump the event::addr_filters_gen; + * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. 
@@ -4056,7 +4069,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->event_list); INIT_LIST_HEAD(&ctx->pinned_active); INIT_LIST_HEAD(&ctx->flexible_active); - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); } static struct perf_event_context * @@ -4235,8 +4248,9 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || - attr->context_switch) + attr->task || attr->ksymbol || + attr->context_switch || + attr->bpf_event) return true; return false; } @@ -4305,6 +4319,10 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if (event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -4440,7 +4458,7 @@ static void _free_event(struct perf_event *event) perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); @@ -4963,6 +4981,11 @@ static void __perf_event_period(struct perf_event *event, } } +static int perf_event_check_period(struct perf_event *event, u64 value) +{ + return event->pmu->check_period(event, value); +} + static int perf_event_period(struct perf_event *event, u64 __user *arg) { u64 value; @@ -4979,6 +5002,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; + if (perf_event_check_period(event, value)) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; @@ -5388,7 +5414,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { - if (!atomic_inc_not_zero(&rb->refcount)) + if 
(!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); @@ -5398,7 +5424,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) void ring_buffer_put(struct ring_buffer *rb) { - if (!atomic_dec_and_test(&rb->refcount)) + if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); @@ -5459,11 +5485,11 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); - WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&event->mmap_mutex); } @@ -5532,7 +5558,7 @@ again: */ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: @@ -5680,7 +5706,7 @@ accounting: lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -5721,7 +5747,7 @@ accounting: unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { @@ -5795,10 +5821,45 @@ void perf_event_wakeup(struct perf_event *event) } } +static void perf_pending_event_disable(struct perf_event *event) +{ + int cpu = READ_ONCE(event->pending_disable); + + if (cpu < 0) + return; + + if (cpu == smp_processor_id()) { + WRITE_ONCE(event->pending_disable, -1); + perf_event_disable_local(event); + return; + } + + /* + * CPU-A CPU-B + * + * perf_event_disable_inatomic() + * @pending_disable = CPU-A; + * 
irq_work_queue(); + * + * sched-out + * @pending_disable = -1; + * + * sched-in + * perf_event_disable_inatomic() + * @pending_disable = CPU-B; + * irq_work_queue(); // FAILS + * + * irq_work_run() + * perf_pending_event() + * + * But the event runs on CPU-B and wants disabling there. + */ + irq_work_queue_on(&event->pending, cpu); +} + static void perf_pending_event(struct irq_work *entry) { - struct perf_event *event = container_of(entry, - struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending); int rctx; rctx = perf_swevent_get_recursion_context(); @@ -5807,10 +5868,7 @@ static void perf_pending_event(struct irq_work *entry) * and we won't recurse 'further'. */ - if (event->pending_disable) { - event->pending_disable = 0; - perf_event_disable_local(event); - } + perf_pending_event_disable(event); if (event->pending_wakeup) { event->pending_wakeup = 0; @@ -6489,7 +6547,7 @@ void perf_prepare_sample(struct perf_event_header *header, data->phys_addr = perf_virt_to_phys(data->addr); } -static __always_inline void +static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, @@ -6499,13 +6557,15 @@ __perf_event_output(struct perf_event *event, { struct perf_output_handle handle; struct perf_event_header header; + int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); - if (output_begin(&handle, event, header.size)) + err = output_begin(&handle, event, header.size); + if (err) goto exit; perf_output_sample(&handle, &header, data, event); @@ -6514,6 +6574,7 @@ __perf_event_output(struct perf_event *event, exit: rcu_read_unlock(); + return err; } void @@ -6532,12 +6593,12 @@ perf_event_output_backward(struct perf_event *event, __perf_event_output(event, data, regs, perf_output_begin_backward); } -void +int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) 
{ - __perf_event_output(event, data, regs, perf_output_begin); + return __perf_event_output(event, data, regs, perf_output_begin); } /* @@ -6678,7 +6739,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; restart++; } @@ -7170,6 +7232,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; + u32 type = mmap_event->event_id.header.type; int ret; if (!perf_event_mmap_match(event, data)) @@ -7213,6 +7276,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_end(&handle); out: mmap_event->event_id.header.size = size; + mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -7358,28 +7422,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, return true; } +static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, + struct vm_area_struct *vma, + struct perf_addr_filter_range *fr) +{ + unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct file *file = vma->vm_file; + + if (!perf_addr_filter_match(filter, file, off, vma_size)) + return false; + + if (filter->offset < off) { + fr->start = vma->vm_start; + fr->size = min(vma_size, filter->size - (off - filter->offset)); + } else { + fr->start = vma->vm_start + filter->offset - off; + fr->size = min(vma->vm_end - fr->start, filter->size); + } + + return true; +} + static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; - unsigned 
long off = vma->vm_pgoff << PAGE_SHIFT, flags; - struct file *file = vma->vm_file; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; + unsigned long flags; if (!has_addr_filter(event)) return; - if (!file) + if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (perf_addr_filter_match(filter, file, off, - vma->vm_end - vma->vm_start)) { - event->addr_filters_offs[count] = vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, + &event->addr_filter_ranges[count])) restart++; - } count++; } @@ -7650,6 +7733,207 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= 
PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strlcpy(name, sym, KSYM_NAME_LEN); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = sizeof(ksymbol_event.event_id) + + name_len, + }, + .addr = addr, + .len = len, + .ksym_type = ksym_type, + .flags = flags, + }, + }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + char sym[KSYM_NAME_LEN]; + int i; + + if (prog->aux->func_cnt == 0) { + bpf_get_prog_name(prog, sym); + 
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, sym); + } else { + for (i = 0; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + bpf_get_prog_name(subprog, sym); + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, sym); + } + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + if (type <= PERF_BPF_EVENT_UNKNOWN || + type >= PERF_BPF_EVENT_MAX) + return; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + break; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + .type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -8768,26 +9052,19 @@ static void perf_addr_filters_splice(struct perf_event *event, * @filter; if so, adjust filter's address range. * Called with mm::mmap_sem down for reading. 
*/ -static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, - struct mm_struct *mm) +static void perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm, + struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct file *file = vma->vm_file; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - unsigned long vma_size = vma->vm_end - vma->vm_start; - - if (!file) - continue; - - if (!perf_addr_filter_match(filter, file, off, vma_size)) + if (!vma->vm_file) continue; - return vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, fr)) + return; } - - return 0; } /* @@ -8810,26 +9087,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (task == TASK_TOMBSTONE) return; - if (!ifh->nr_file_filters) - return; - - mm = get_task_mm(event->ctx->task); - if (!mm) - goto restart; + if (ifh->nr_file_filters) { + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); + } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - event->addr_filters_offs[count] = 0; + if (filter->path.dentry) { + /* + * Adjust base offset if the filter is associated to a + * binary that needs to be mapped: + */ + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; - /* - * Adjust base offset if the filter is associated to a binary - * that needs to be mapped: - */ - if (filter->path.dentry) - event->addr_filters_offs[count] = - perf_addr_filter_apply(filter, mm); + perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); + } else { + event->addr_filter_ranges[count].start = filter->offset; + event->addr_filter_ranges[count].size = filter->size; + } count++; } @@ -8837,9 +9117,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, 
flags); - up_read(&mm->mmap_sem); + if (ifh->nr_file_filters) { + up_read(&mm->mmap_sem); - mmput(mm); + mmput(mm); + } restart: perf_event_stop(event, 1); @@ -8943,6 +9225,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; + /* fall through */ case IF_SRC_FILEADDR: case IF_SRC_FILE: @@ -9391,6 +9674,11 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } +static int perf_event_nop_int(struct perf_event *event, u64 value) +{ + return 0; +} + static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) @@ -9691,6 +9979,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->check_period) + pmu->check_period = perf_event_nop_int; + if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; @@ -9772,6 +10063,15 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); + if (!ret) { + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) { + if (event->destroy) + event->destroy(event); + ret = -EINVAL; + } + } + if (ret) module_put(pmu->module); @@ -9900,6 +10200,10 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_inc(&nr_bpf_events); if (inc) { /* @@ -9980,6 +10284,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); + event->pending_disable = -1; init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); @@ -10082,14 +10387,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; if (has_addr_filter(event)) { - event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, - sizeof(unsigned long), - GFP_KERNEL); - if (!event->addr_filters_offs) { + 
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, + sizeof(struct perf_addr_filter_range), + GFP_KERNEL); + if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } + /* + * Clone the parent's vma offsets: they are valid until exec() + * even if the mm is not shared with the parent. + */ + if (event->parent) { + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + raw_spin_lock_irq(&ifh->lock); + memcpy(event->addr_filter_ranges, + event->parent->addr_filter_ranges, + pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); + raw_spin_unlock_irq(&ifh->lock); + } + /* force hw sync on the address filters */ event->addr_filters_gen = 1; } @@ -10108,7 +10427,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; err_addr_filters: - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); @@ -10391,7 +10710,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader, again: rcu_read_lock(); gctx = READ_ONCE(group_leader->ctx); - if (!atomic_inc_not_zero(&gctx->refcount)) { + if (!refcount_inc_not_zero(&gctx->refcount)) { rcu_read_unlock(); goto again; } @@ -11608,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void) } } -void perf_swevent_init_cpu(unsigned int cpu) +static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 5befb338a18d..c5cd852fe86b 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -1,18 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 6dc725a7e7bc..79c47076700a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -4,13 +4,14 @@ #include <linux/hardirq.h> #include <linux/uaccess.h> +#include <linux/refcount.h> /* Buffer handling */ #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { - atomic_t refcount; + refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; @@ -48,7 +49,7 @@ struct ring_buffer { atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); - atomic_t aux_refcount; + refcount_t aux_refcount; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 309ef5a64af5..674b35383491 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events ring-buffer code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/perf_event.h> @@ -285,7 +284,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) else rb->overwrite = 1; - atomic_set(&rb->refcount, 1); + refcount_set(&rb->refcount, 1); INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); @@ -358,7 +357,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!atomic_read(&rb->aux_mmap_count)) goto err; - if (!atomic_inc_not_zero(&rb->aux_refcount)) + if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; /* @@ -393,7 +392,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = 1; + event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); local_set(&rb->aux_nest, 0); goto err_put; @@ -456,24 +455,21 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; } - if (size || handle->aux_flags) { - /* - * Only send RECORD_AUX if we have something useful to communicate - * - * Note: the OVERWRITE records by themselves are not considered - * useful, as they don't communicate any *new* information, - * aside from the short-lived offset, that becomes history at - * the next event sched-in and therefore isn't useful. - * The userspace that needs to copy out AUX data in overwrite - * mode should know to use user_page::aux_head for the actual - * offset. So, from now on we don't output AUX records that - * have *only* OVERWRITE flag set. 
- */ - - if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE) - perf_event_aux_event(handle->event, aux_head, size, - handle->aux_flags); - } + /* + * Only send RECORD_AUX if we have something useful to communicate + * + * Note: the OVERWRITE records by themselves are not considered + * useful, as they don't communicate any *new* information, + * aside from the short-lived offset, that becomes history at + * the next event sched-in and therefore isn't useful. + * The userspace that needs to copy out AUX data in overwrite + * mode should know to use user_page::aux_head for the actual + * offset. So, from now on we don't output AUX records that + * have *only* OVERWRITE flag set. + */ + if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) + perf_event_aux_event(handle->event, aux_head, size, + handle->aux_flags); rb->user_page->aux_head = rb->aux_head; if (rb_need_aux_wakeup(rb)) @@ -481,7 +477,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = 1; + handle->event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); } @@ -599,29 +595,26 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order = 0; + int ret = -ENOMEM, max_order; if (!has_aux(event)) return -EOPNOTSUPP; - if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { - /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. - */ - max_order = ilog2(nr_pages); + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. 
+ */ + max_order = ilog2(nr_pages); - /* - * PMU requests more than one contiguous chunks of memory - * for SW double buffering - */ - if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && - !overwrite) { - if (!max_order) - return -EINVAL; + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if (!overwrite) { + if (!max_order) + return -EINVAL; - max_order--; - } + max_order--; } rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, @@ -658,7 +651,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, goto out; } - rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; @@ -671,7 +664,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, * we keep a refcount here to make sure either of the two can * reference them safely. */ - atomic_set(&rb->aux_refcount, 1); + refcount_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; @@ -690,7 +683,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { - if (atomic_dec_and_test(&rb->aux_refcount)) + if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); } @@ -734,7 +727,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct ring_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) >= MAX_ORDER) + if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) goto fail; rb = kzalloc(size, GFP_KERNEL); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8aef47ee7bfa..4ca7364c956d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the 
Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju @@ -66,7 +53,7 @@ static struct percpu_rw_semaphore dup_mmap_sem; struct uprobe { struct rb_node rb_node; /* node in the rb tree */ - atomic_t ref; + refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; @@ -560,13 +547,13 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v static struct uprobe *get_uprobe(struct uprobe *uprobe) { - atomic_inc(&uprobe->ref); + refcount_inc(&uprobe->ref); return uprobe; } static void put_uprobe(struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) { + if (refcount_dec_and_test(&uprobe->ref)) { /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from @@ -657,7 +644,7 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) rb_link_node(&uprobe->rb_node, parent, p); rb_insert_color(&uprobe->rb_node, &uprobes_tree); /* get access + creation ref */ - atomic_set(&uprobe->ref, 2); + refcount_set(&uprobe->ref, 2); return u; } @@ -2041,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) @@ -2307,16 
+2294,14 @@ static struct notifier_block uprobe_exception_nb = { .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; -static int __init init_uprobes(void) +void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - if (percpu_init_rwsem(&dup_mmap_sem)) - return -ENOMEM; + BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); - return register_die_notifier(&uprobe_exception_nb); + BUG_ON(register_die_notifier(&uprobe_exception_nb)); } -__initcall(init_uprobes); diff --git a/kernel/exit.c b/kernel/exit.c index 2639a30a8aa5..2166c2d92ddc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -219,6 +219,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..8b03d93ba068 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -11,6 +11,7 @@ * management can be a bitch. 
See 'mm/memory.c': 'copy_page_range()' */ +#include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> @@ -21,6 +22,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> @@ -77,7 +79,6 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -429,7 +430,7 @@ static void release_task_stack(struct task_struct *tsk) #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { - if (atomic_dec_and_test(&tsk->stack_refcount)) + if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif @@ -447,7 +448,7 @@ void free_task(struct task_struct *tsk) * If the task had a separate stack allocation, it should be gone * by now. 
*/ - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -710,14 +711,14 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); cgroup_free(tsk); @@ -816,6 +817,7 @@ void __init fork_init(void) #endif lockdep_init_task(&init_task); + uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -867,7 +869,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->stack_vm_area = stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK - atomic_set(&tsk->stack_refcount, 1); + refcount_set(&tsk->stack_refcount, 1); #endif if (err) @@ -896,7 +898,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) * One for us, one for whoever does the "release_task()" (usually * parent) */ - atomic_set(&tsk->usage, 2); + refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -981,7 +983,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - mm->pinned_vm = 0; + atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); @@ -1299,13 +1301,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. 
+/** + * dup_mm() - duplicates an existing mm structure + * @tsk: the task_struct with which the new mm will be associated. + * @oldmm: the mm to duplicate. + * + * Allocates a new mm structure and duplicates the provided @oldmm structure + * content into it. + * + * Return: the duplicated mm or NULL on failure. */ -static struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk, + struct mm_struct *oldmm) { - struct mm_struct *mm, *oldmm = current->mm; + struct mm_struct *mm; int err; mm = allocate_mm(); @@ -1372,7 +1381,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) } retval = -ENOMEM; - mm = dup_mm(tsk); + mm = dup_mm(tsk, current->mm); if (!mm) goto fail_nomem; @@ -1463,7 +1472,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { - atomic_inc(&current->sighand->count); + refcount_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); @@ -1471,7 +1480,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + refcount_set(&sig->count, 1); spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); @@ -1480,7 +1489,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) { + if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it @@ -1527,7 +1536,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); - atomic_set(&sig->sigcnt, 1); + refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); @@ -1663,6 +1672,58 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +const struct file_operations pidfd_fops = { + .release = pidfd_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. 
@@ -1675,13 +1736,14 @@ static __latent_entropy struct task_struct *copy_process( unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, + int __user *parent_tidptr, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node) { - int retval; + int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; @@ -1731,6 +1793,31 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + int reserved; + + /* + * - CLONE_PARENT_SETTID is useless for pidfds and also + * parent_tidptr is used to return pidfds. + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. + */ + if (clone_flags & + (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + + /* + * Verify that parent_tidptr is sane so we can potentially + * reuse it later. + */ + if (get_user(reserved, parent_tidptr)) + return ERR_PTR(-EFAULT); + + if (reserved != 0) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -1937,6 +2024,22 @@ static __latent_entropy struct task_struct *copy_process( } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). 
+ */ + if (clone_flags & CLONE_PIDFD) { + retval = pidfd_create(pid); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + retval = put_user(pidfd, parent_tidptr); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1997,7 +2100,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_put_pidfd; /* * From this point on we must avoid any synchronous user-space @@ -2082,7 +2185,7 @@ static __latent_entropy struct task_struct *copy_process( } else { current->signal->nr_threads++; atomic_inc(&current->signal->live); - atomic_inc(&current->signal->sigcnt); + refcount_inc(&current->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); @@ -2112,6 +2215,9 @@ bad_fork_cancel_cgroup: spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) + ksys_close(pidfd); bad_fork_free_pid: cgroup_threadgroup_change_end(current); if (pid != &init_struct_pid) @@ -2177,7 +2283,7 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, + task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, cpu_to_node(cpu)); if (!IS_ERR(task)) { init_idle_pids(task); @@ -2187,6 +2293,11 @@ struct task_struct *fork_idle(int cpu) return task; } +struct mm_struct *copy_init_mm(void) +{ + return dup_mm(NULL, &init_mm); +} + /* * Ok, this is the main fork-routine.
* @@ -2224,7 +2335,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); @@ -2439,7 +2550,7 @@ static int check_unshare_flags(unsigned long unshare_flags) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { - if (atomic_read(&current->sighand->count) > 1) + if (refcount_read(&current->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { diff --git a/kernel/futex.c b/kernel/futex.c index a0514e01c3eb..6262f1534ac9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -68,6 +68,7 @@ #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> +#include <linux/refcount.h> #include <asm/futex.h> @@ -212,7 +213,7 @@ struct futex_pi_state { struct rt_mutex pi_mutex; struct task_struct *owner; - atomic_t refcount; + refcount_t refcount; union futex_key key; } __randomize_layout; @@ -321,12 +322,8 @@ static int __init fail_futex_debugfs(void) if (IS_ERR(dir)) return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-private", mode, dir, - &fail_futex.ignore_private)) { - debugfs_remove_recursive(dir); - return -ENOMEM; - } - + debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private); return 0; } @@ -803,7 +800,7 @@ static int refill_pi_state_cache(void) INIT_LIST_HEAD(&pi_state->list); /* pi_mutex gets initialized later */ pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); + refcount_set(&pi_state->refcount, 1); pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@ -823,7 +820,7 @@ static struct futex_pi_state *alloc_pi_state(void) static void get_pi_state(struct futex_pi_state *pi_state) { - WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); + WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); } /* @@ -835,7 +832,7 @@ static void put_pi_state(struct futex_pi_state
*pi_state) if (!pi_state) return; - if (!atomic_dec_and_test(&pi_state->refcount)) + if (!refcount_dec_and_test(&pi_state->refcount)) return; /* @@ -865,7 +862,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * refcount is at 0 - put it back to 1. */ pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); + refcount_set(&pi_state->refcount, 1); current->pi_state_cache = pi_state; } } @@ -908,7 +905,7 @@ void exit_pi_state_list(struct task_struct *curr) * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ - if (!atomic_inc_not_zero(&pi_state->refcount)) { + if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); @@ -1064,7 +1061,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval, * and futex_wait_requeue_pi() as it cannot go to 0 and consequently * free pi_state before we can take a reference ourselves. */ - WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(!refcount_read(&pi_state->refcount)); /* * Now that we have a pi_state, we can acquire wait_lock @@ -1314,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { + int err; u32 uninitialized_var(curval); if (unlikely(should_fail_futex(true))) return -EFAULT; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (unlikely(err)) + return err; /* If user space value changed, let the caller retry */ return curval != uval ? -EAGAIN : 0; @@ -1467,8 +1466,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * Queue the task for later wakeup for after we've released * the hb->lock. wake_q_add() grabs reference to p. 
*/ - wake_q_add(wake_q, p); - put_task_struct(p); + wake_q_add_safe(wake_q, p); } /* @@ -1506,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { - ret = -EFAULT; - - } else if (curval != uval) { + ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (!ret && (curval != uval)) { /* * If a unconditional UNLOCK_PI operation (user space did not * try the TID->0 transition) raced with a waiter setting the @@ -1704,32 +1700,32 @@ retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); -#ifndef CONFIG_MMU - /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking - */ - ret = op_ret; - goto out_put_keys; -#endif - - if (unlikely(op_ret != -EFAULT)) { + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ ret = op_ret; goto out_put_keys; } - ret = fault_in_user_writeable(uaddr2); - if (ret) - goto out_put_keys; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + goto out_put_keys; + } - if (!(flags & FLAGS_SHARED)) + if (!(flags & FLAGS_SHARED)) { + cond_resched(); goto retry_private; + } put_futex_key(&key2); put_futex_key(&key1); + cond_resched(); goto retry; } @@ -2354,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 uval, uninitialized_var(curval), newval; struct task_struct *oldowner, *newowner; u32 newtid; - int ret; + int ret, err = 0; lockdep_assert_held(q->lock_ptr); @@ -2425,14 +2421,17 @@ retry: if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; + err = 
get_futex_value_locked(&uval, uaddr); + if (err) + goto handle_err; for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - goto handle_fault; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (err) + goto handle_err; + if (curval == uval) break; uval = curval; @@ -2460,23 +2459,37 @@ retry: return 0; /* - * To handle the page fault we need to drop the locks here. That gives - * the other task (either the highest priority waiter itself or the - * task which stole the rtmutex) the chance to try the fixup of the - * pi_state. So once we are back from handling the fault we need to - * check the pi_state after reacquiring the locks and before trying to - * do another fixup. When the fixup has been done already we simply - * return. + * In order to reschedule or handle a page fault, we need to drop the + * locks here. In the case of a fault, this gives the other task + * (either the highest priority waiter itself or the task which stole + * the rtmutex) the chance to try the fixup of the pi_state. So once we + * are back from handling the fault we need to check the pi_state after + * reacquiring the locks and before trying to do another fixup. When + * the fixup has been done already we simply return. * * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely * drop hb->lock since the caller owns the hb -> futex_q relation. * Dropping the pi_mutex->wait_lock requires the state revalidate. 
*/ -handle_fault: +handle_err: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); - ret = fault_in_user_writeable(uaddr); + switch (err) { + case -EFAULT: + ret = fault_in_user_writeable(uaddr); + break; + + case -EAGAIN: + cond_resched(); + ret = 0; + break; + + default: + WARN_ON_ONCE(1); + ret = err; + break; + } spin_lock(q->lock_ptr); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); @@ -3045,10 +3058,8 @@ retry: * A unconditional UNLOCK_PI op raced against a waiter * setting the FUTEX_WAITERS bit. Try again. */ - if (ret == -EAGAIN) { - put_futex_key(&key); - goto retry; - } + if (ret == -EAGAIN) + goto pi_retry; /* * wake_futex_pi has detected invalid state. Tell user * space. @@ -3063,9 +3074,19 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { spin_unlock(&hb->lock); - goto pi_faulted; + switch (ret) { + case -EFAULT: + goto pi_faulted; + + case -EAGAIN: + goto pi_retry; + + default: + WARN_ON_ONCE(1); + goto out_putkey; + } } /* @@ -3079,6 +3100,11 @@ out_putkey: put_futex_key(&key); return ret; +pi_retry: + put_futex_key(&key); + cond_resched(); + goto retry; + pi_faulted: put_futex_key(&key); @@ -3439,47 +3465,67 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { u32 uval, uninitialized_var(nval), mval; + int err; + + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { - /* - * Ok, this dying thread is truly holding a futex - * of interest. Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). 
(We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - /* - * We are not holding a lock here, but we want to have - * the pagefault_disable/enable() protection because - * we want to handle the fault gracefully. If the - * access fails we try to fault in the futex with R/W - * verification via get_user_pages. get_user() above - * does not guarantee R/W access. If that fails we - * give up and leave the futex locked. - */ - if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { + switch (err) { + case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; - } - if (nval != uval) + + case -EAGAIN: + cond_resched(); goto retry; - /* - * Wake robust non-PI futexes here. 
The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + default: + WARN_ON_ONCE(1); + return err; + } } + + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; } @@ -3823,7 +3869,7 @@ err_unlock: #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, +SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 1e32e66c9563..2dddecbdbe6e 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -245,8 +245,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) /* Duplicate gcov_info. */ active = num_counter_active(info); - dup = kzalloc(sizeof(struct gcov_info) + - sizeof(struct gcov_ctr_info) * active, GFP_KERNEL); + dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); if (!dup) return NULL; dup->version = info->version; @@ -364,8 +363,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info) { struct gcov_iterator *iter; - iter = kzalloc(sizeof(struct gcov_iterator) + - num_counter_active(info) * sizeof(struct type_info), + iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), GFP_KERNEL); if (iter) iter->info = info; diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS_PROC. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." 
+outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. +pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! 
-name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 4a9191617076..f108a95882c6 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -19,6 +19,7 @@ #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/sched/sysctl.h> #include <trace/events/sched.h> @@ -126,7 +127,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", - t->comm, t->pid, timeout); + t->comm, t->pid, (jiffies - t->last_switch_time) / HZ); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), diff --git a/kernel/iomem.c b/kernel/iomem.c index f7525e14ebc6..93c264444510 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size, * * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. 
- * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return * a pointer into the direct map. * @@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { /* - * MEMREMAP_WB is special in that it can be satisifed + * MEMREMAP_WB is special in that it can be satisfied * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where * the requested range is potentially in System RAM. diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 45b68b4ea48b..f18cd5aa33e8 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -9,7 +9,7 @@ #include <linux/cpu.h> static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - int cpus_per_vec) + unsigned int cpus_per_vec) { const struct cpumask *siblmsk; int cpu, sibl; @@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, } static int __irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, int firstvec, + unsigned int startvec, + unsigned int numvecs, + unsigned int firstvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct irq_affinity_desc *masks) { - int n, nodes, cpus_per_vec, extra_vecs, done = 0; - int last_affv = firstvec + numvecs; - int curvec = startvec; + unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; + unsigned int last_affv = firstvec + numvecs; + unsigned int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; if (!cpumask_weight(cpu_mask)) @@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(&masks[curvec].mask, - 
&masks[curvec].mask, - node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, + node_to_cpumask[n]); if (++curvec == last_affv) curvec = firstvec; } - done = numvecs; - goto out; + return numvecs; } for_each_node_mask(n, nodemsk) { - int ncpus, v, vecs_to_assign, vecs_per_node; + unsigned int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; @@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, curvec = firstvec; --nodes; } - -out: return done; } @@ -174,19 +172,24 @@ out: * 2) spread other possible CPUs on these vectors */ static int irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, int firstvec, - cpumask_var_t *node_to_cpumask, + unsigned int startvec, unsigned int numvecs, + unsigned int firstvec, struct irq_affinity_desc *masks) { - int curvec = startvec, nr_present, nr_others; - int ret = -ENOMEM; + unsigned int curvec = startvec, nr_present, nr_others; + cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return ret; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail; + goto fail_nmsk; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto fail_npresmsk; ret = 0; /* Stabilize the cpumasks */ @@ -217,13 +220,22 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, if (nr_present < numvecs) WARN_ON(nr_present + nr_others < numvecs); + free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk: free_cpumask_var(npresmsk); - fail: + fail_nmsk: free_cpumask_var(nmsk); return ret; } +static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) +{ + affd->nr_sets = 1; + affd->set_size[0] = affvecs; +} + /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading * @nvecs: The total number of vectors @@ -232,50 +244,62 @@ static int 
irq_build_affinity_masks(const struct irq_affinity *affd, * Returns the irq_affinity_desc pointer or NULL if allocation failed. */ struct irq_affinity_desc * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) { - int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; - int curvec, usedvecs; - cpumask_var_t *node_to_cpumask; + unsigned int affvecs, curvec, usedvecs, i; struct irq_affinity_desc *masks = NULL; - int i, nr_sets; /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. + * Determine the number of vectors which need interrupt affinities + * assigned. If the pre/post request exhausts the available vectors + * then nothing to do here except for invoking the calc_sets() + * callback so the device driver can adjust to the situation. If there + * is only a single vector, then managing the queue is pointless as + * well. */ - if (nvecs == affd->pre_vectors + affd->post_vectors) + if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors) + affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + else + affvecs = 0; + + /* + * Simple invocations do not provide a calc_sets() callback. Install + * the generic one. + */ + if (!affd->calc_sets) + affd->calc_sets = default_calc_sets; + + /* Recalculate the sets */ + affd->calc_sets(affd, affvecs); + + if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS)) return NULL; - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) + /* Nothing to assign? */ + if (!affvecs) return NULL; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) - goto outnodemsk; + return NULL; /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) cpumask_copy(&masks[curvec].mask, irq_default_affinity); + /* * Spread on present CPUs starting from affd->pre_vectors. 
If we * have multiple sets, build each sets affinity mask separately. */ - nr_sets = affd->nr_sets; - if (!nr_sets) - nr_sets = 1; - - for (i = 0, usedvecs = 0; i < nr_sets; i++) { - int this_vecs = affd->sets ? affd->sets[i] : affvecs; + for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { + unsigned int this_vecs = affd->set_size[i]; int ret; ret = irq_build_affinity_masks(affd, curvec, this_vecs, - curvec, node_to_cpumask, masks); + curvec, masks); if (ret) { kfree(masks); - masks = NULL; - goto outnodemsk; + return NULL; } curvec += this_vecs; usedvecs += this_vecs; @@ -293,8 +317,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++) masks[i].is_managed = 1; -outnodemsk: - free_node_to_cpumask(node_to_cpumask); return masks; } @@ -304,25 +326,22 @@ outnodemsk: * @maxvec: The maximum number of vectors available * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) +unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, + const struct irq_affinity *affd) { - int resv = affd->pre_vectors + affd->post_vectors; - int vecs = maxvec - resv; - int set_vecs; + unsigned int resv = affd->pre_vectors + affd->post_vectors; + unsigned int set_vecs; if (resv > minvec) return 0; - if (affd->nr_sets) { - int i; - - for (i = 0, set_vecs = 0; i < affd->nr_sets; i++) - set_vecs += affd->sets[i]; + if (affd->calc_sets) { + set_vecs = maxvec - resv; } else { get_online_cpus(); set_vecs = cpumask_weight(cpu_possible_mask); put_online_cpus(); } - return resv + min(set_vecs, vecs); + return resv + min(set_vecs, maxvec - resv); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 34e969069488..51128bea3846 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -730,6 +730,37 @@ out: EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** + * handle_fasteoi_nmi - irq handler for NMI interrupt 
lines + * @desc: the interrupt description structure for this irq + * + * A simple NMI-safe handler, considering the restrictions + * from request_nmi. + * + * Only a single callback will be issued to the chip: an ->eoi() + * call when the interrupt has been serviced. This enables support + * for modern forms of interrupt handlers, which handle the flow + * details in hardware, transparently. + */ +void handle_fasteoi_nmi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + trace_irq_handler_entry(irq, action); + /* + * NMIs cannot be shared, there is only one action. + */ + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); + +/** * handle_edge_irq - edge type IRQ handler * @desc: the interrupt description structure for this irq * @@ -855,7 +886,11 @@ void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -884,7 +919,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. 
+ */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -908,6 +947,29 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } +/** + * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu + * dev ids + * @desc: the interrupt description structure for this irq + * + * Similar to handle_fasteoi_nmi, but handling the dev_id cookie + * as a percpu pointer. + */ +void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} + static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) @@ -1278,6 +1340,17 @@ void irq_chip_mask_parent(struct irq_data *data) EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** + * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask_ack(data); +} +EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent); + +/** * irq_chip_unmask_parent - Unmask the parent interrupt * @data: Pointer to interrupt specific data */ @@ -1376,11 +1449,16 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) { data = data->parent_data; + + if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE) + return 0; + if (data->chip->irq_set_wake) return data->chip->irq_set_wake(data, on); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); #endif /** diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 
6f636136cccc..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), + BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), }; static void @@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = { BIT_MASK_DESCR(IRQS_WAITING), BIT_MASK_DESCR(IRQS_PENDING), BIT_MASK_DESCR(IRQS_SUSPENDED), + BIT_MASK_DESCR(IRQS_NMI), }; @@ -150,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, @@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf, chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_settings_is_level(desc)) { - /* Can't do level, sorry */ + if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) { + /* Can't do level nor NMIs, sorry */ err = -EINVAL; } else { desc->istate |= IRQS_PENDING; @@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void) int irq; root_dir = debugfs_create_dir("irq", NULL); - if (!root_dir) - return -ENOMEM; irq_domain_debugfs_init(root_dir); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 5d5378ea0afe..f6e5515ee077 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a threaded interrupt context. 
NULL - * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device, dev_name(dev) if NULL * @dev_id: A cookie passed back to the handler function @@ -222,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, irq_flow_handler_t handler) { struct irq_chip_generic *gc; - unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); - gc = devm_kzalloc(dev, sz, GFP_KERNEL); + gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL); if (gc) irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, handler); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 38554bc35375..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); @@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags __irq_wake_thread(desc, action); - /* Fall through to add to randomness */ + /* Fall through - to add to randomness */ case IRQ_HANDLED: *flags |= action->flags; break; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ca6afa267070..70c3053bc1f6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -49,6 +49,7 @@ enum { * IRQS_WAITING - irq is waiting * IRQS_PENDING - irq is pending and replayed later * IRQS_SUSPENDED - irq is suspended + * IRQS_NMI - irq line is used to deliver NMIs */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,6 +61,7 @@ enum { IRQS_PENDING = 0x00000200, IRQS_SUSPENDED = 0x00000800, IRQS_TIMINGS = 0x00001000, + IRQS_NMI = 0x00002000, }; #include "debug.h" @@ 
-242,12 +244,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc) #undef __irqd_to_state -static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +{ + __kstat_incr_irqs_this_cpu(desc); + desc->tot_count++; +} + static inline int irq_desc_get_node(struct irq_desc *desc) { return irq_common_data_get_node(&desc->irq_common_data); diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 98a20e1594ce..b992f88c5613 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -25,10 +25,22 @@ static void irq_sim_irqunmask(struct irq_data *data) irq_ctx->enabled = true; } +static int irq_sim_set_type(struct irq_data *data, unsigned int type) +{ + /* We only support rising and falling edge trigger types. */ + if (type & ~IRQ_TYPE_EDGE_BOTH) + return -EINVAL; + + irqd_set_trigger_type(data, type); + + return 0; +} + static struct irq_chip irq_sim_irqchip = { .name = "irq_sim", .irq_mask = irq_sim_irqmask, .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, }; static void irq_sim_handle_irq(struct irq_work *work) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index ef8ad36cadcf..c52b737ab8e3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; + desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) @@ -274,11 +275,12 @@ static struct attribute *irq_attrs[] = { &actions_attr.attr, NULL }; +ATTRIBUTE_GROUPS(irq); static struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = irq_attrs, + .default_groups = irq_groups, }; static void irq_sysfs_add(int 
irq, struct irq_desc *desc) @@ -557,6 +559,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + mutex_init(&desc[i].request_mutex); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); @@ -669,6 +672,41 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, set_irq_regs(old_regs); return ret; } + +#ifdef CONFIG_IRQ_DOMAIN +/** + * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * @regs: Register file coming from the low-level handling code + * + * Returns: 0 on success, or -EINVAL if conversion has failed + */ +int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, + struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned int irq; + int ret = 0; + + nmi_enter(); + + irq = irq_find_mapping(domain, hwirq); + + /* + * ack_bad_irq is not NMI-safe, just report + * an invalid interrupt. 
+ */ + if (likely(irq)) + generic_handle_irq(irq); + else + ret = -EINVAL; + + nmi_exit(); + set_irq_regs(old_regs); + return ret; +} +#endif #endif /* Dynamic interrupt handling */ @@ -919,11 +957,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - int cpu; unsigned int sum = 0; + int cpu; if (!desc || !desc->kstat_irqs) return 0; + if (!irq_settings_is_per_cpu_devid(desc) && + !irq_settings_is_per_cpu(desc)) + return desc->tot_count; + for_each_possible_cpu(cpu) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8b0be4bd6565..9ed29e4a7dbf 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); +/** + * irq_get_default_host() - Retrieve the "default" irq domain + * + * Returns: the default domain, if any. + * + * Modern code should never use this. This should only be used on + * systems that cannot implement a firmware->fwnode mapping (which + * both DT and ACPI provide). + */ +struct irq_domain *irq_get_default_host(void) +{ + return irq_default_domain; +} + static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { @@ -729,16 +743,17 @@ static int irq_domain_translate(struct irq_domain *d, return 0; } -static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, +static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec) { int i; - fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; - fwspec->param_count = irq_data->args_count; + fwspec->fwnode = np ? 
&np->fwnode : NULL; + fwspec->param_count = count; - for (i = 0; i < irq_data->args_count; i++) - fwspec->param[i] = irq_data->args[i]; + for (i = 0; i < count; i++) + fwspec->param[i] = args[i]; } unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) @@ -836,7 +851,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_fwspec fwspec; - of_phandle_args_to_fwspec(irq_data, &fwspec); + of_phandle_args_to_fwspec(irq_data->np, irq_data->args, + irq_data->args_count, &fwspec); + return irq_create_fwspec_mapping(&fwspec); } EXPORT_SYMBOL_GPL(irq_create_of_mapping); @@ -928,11 +945,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { - if (WARN_ON(intsize < 2)) - return -EINVAL; - *out_hwirq = intspec[0]; - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; - return 0; + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); @@ -968,6 +984,27 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +/** + * irq_domain_translate_twocell() - Generic translate for direct two cell + * bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. 
+ */ +int irq_domain_translate_twocell(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *out_hwirq, + unsigned int *out_type) +{ + if (WARN_ON(fwspec->param_count < 2)) + return -EINVAL; + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { @@ -1749,8 +1786,6 @@ void __init irq_domain_debugfs_init(struct dentry *root) struct irq_domain *d; domain_dir = debugfs_create_dir("domains", root); - if (!domain_dir) - return; debugfs_create_file("default", 0444, domain_dir, NULL, &irq_domain_debug_fops); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84b54a17b95d..78f3ddeb7fe4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -196,6 +196,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); + /* fall through */ case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); @@ -341,7 +342,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) /* The release function is promised process context */ might_sleep(); - if (!desc) + if (!desc || desc->istate & IRQS_NMI) return -EINVAL; /* Complete initialisation of *notify */ @@ -356,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); - if (old_notify) + if (old_notify) { + cancel_work_sync(&old_notify->work); kref_put(&old_notify->kref, old_notify->release); + } return 0; } @@ -553,6 +556,21 @@ bool disable_hardirq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_hardirq); +/** + * disable_nmi_nosync - disable an nmi without waiting + * @irq: Interrupt to 
disable + * + * Disable the selected interrupt line. Disables and enables are + * nested. + * The interrupt to disable must have been requested through request_nmi. + * Unlike disable_nmi(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + */ +void disable_nmi_nosync(unsigned int irq) +{ + disable_irq_nosync(irq); +} + void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { @@ -609,6 +627,20 @@ out: } EXPORT_SYMBOL(enable_irq); +/** + * enable_nmi - enable handling of an nmi + * @irq: Interrupt to enable + * + * The interrupt to enable must have been requested through request_nmi. + * Undoes the effect of one call to disable_nmi(). If this + * matches the last disable, processing of interrupts on this + * IRQ line is re-enabled. + */ +void enable_nmi(unsigned int irq) +{ + enable_irq(irq); +} + static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); @@ -644,6 +676,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (!desc) return -EINVAL; + /* Don't use NMIs as wake up interrupts please */ + if (desc->istate & IRQS_NMI) { + ret = -EINVAL; + goto out_unlock; + } + /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. 
*/ @@ -666,6 +704,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } + +out_unlock: irq_put_desc_busunlock(desc, flags); return ret; } @@ -726,6 +766,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); + /* fall through */ case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); @@ -740,7 +781,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) @@ -1128,6 +1169,39 @@ static void irq_release_resources(struct irq_desc *desc) c->irq_release_resources(d); } +static bool irq_supports_nmi(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /* Only IRQs directly managed by the root irqchip can be set as NMI */ + if (d->parent_data) + return false; +#endif + /* Don't support NMIs for chips behind a slow bus */ + if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) + return false; + + return d->chip->flags & IRQCHIP_SUPPORTS_NMI; +} + +static int irq_nmi_setup(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *c = d->chip; + + return c->irq_nmi_setup ? 
c->irq_nmi_setup(d) : -EINVAL; +} + +static void irq_nmi_teardown(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *c = d->chip; + + if (c->irq_nmi_teardown) + c->irq_nmi_teardown(d); +} + static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { @@ -1302,9 +1376,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. + * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; + if (desc->istate & IRQS_NMI) { + pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", + new->name, irq, desc->irq_data.chip->name); + ret = -EINVAL; + goto out_unlock; + } + /* * If nobody did set the configuration before, inherit * the one provided by the requester. @@ -1756,6 +1838,59 @@ const void *free_irq(unsigned int irq, void *dev_id) } EXPORT_SYMBOL(free_irq); +/* This function must be called with desc->lock held */ +static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) +{ + const char *devname = NULL; + + desc->istate &= ~IRQS_NMI; + + if (!WARN_ON(desc->action == NULL)) { + irq_pm_remove_action(desc, desc->action); + devname = desc->action->name; + unregister_handler_proc(irq, desc->action); + + kfree(desc->action); + desc->action = NULL; + } + + irq_settings_clr_disable_unlazy(desc); + irq_shutdown(desc); + + irq_release_resources(desc); + + irq_chip_pm_put(&desc->irq_data); + module_put(desc->owner); + + return devname; +} + +const void *free_nmi(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + const void *devname; + + if (!desc || WARN_ON(!(desc->istate & IRQS_NMI))) + return NULL; + + if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return NULL; + + /* NMI still enabled */ + if (WARN_ON(desc->depth == 0)) + disable_nmi_nosync(irq); + + 
raw_spin_lock_irqsave(&desc->lock, flags); + + irq_nmi_teardown(desc); + devname = __cleanup_nmi(irq, desc); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return devname; +} + /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate @@ -1925,6 +2060,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, } EXPORT_SYMBOL_GPL(request_any_context_irq); +/** + * request_nmi - allocate an interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @irqflags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It sets up the IRQ line + * to be handled as an NMI. + * + * An interrupt line delivering NMIs cannot be shared and IRQ handling + * cannot be threaded. + * + * Interrupt lines requested for NMI delivering must produce per cpu + * interrupts and have auto enabling setting disabled. + * + * Dev_id must be globally unique. Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail and return a negative value. 
+ */ +int request_nmi(unsigned int irq, irq_handler_t handler, + unsigned long irqflags, const char *name, void *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + unsigned long flags; + int retval; + + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + /* NMI cannot be shared, used for Polling */ + if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) + return -EINVAL; + + if (!(irqflags & IRQF_PERCPU)) + return -EINVAL; + + if (!handler) + return -EINVAL; + + desc = irq_to_desc(irq); + + if (!desc || irq_settings_can_autoenable(desc) || + !irq_settings_can_request(desc) || + WARN_ON(irq_settings_is_per_cpu_devid(desc)) || + !irq_supports_nmi(desc)) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; + action->name = name; + action->dev_id = dev_id; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + goto err_out; + + retval = __setup_irq(irq, desc, action); + if (retval) + goto err_irq_setup; + + raw_spin_lock_irqsave(&desc->lock, flags); + + /* Setup NMI state */ + desc->istate |= IRQS_NMI; + retval = irq_nmi_setup(desc); + if (retval) { + __cleanup_nmi(irq, desc); + raw_spin_unlock_irqrestore(&desc->lock, flags); + return -EINVAL; + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; + +err_irq_setup: + irq_chip_pm_put(&desc->irq_data); +err_out: + kfree(action); + + return retval; +} + void enable_percpu_irq(unsigned int irq, unsigned int type) { unsigned int cpu = smp_processor_id(); @@ -1959,6 +2189,11 @@ out: } EXPORT_SYMBOL_GPL(enable_percpu_irq); +void enable_percpu_nmi(unsigned int irq, unsigned int type) +{ + enable_percpu_irq(irq, type); +} + /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for @@ -1998,6 +2233,11 @@ void disable_percpu_irq(unsigned int irq) } 
EXPORT_SYMBOL_GPL(disable_percpu_irq); +void disable_percpu_nmi(unsigned int irq) +{ + disable_percpu_irq(irq); +} + /* * Internal function to unregister a percpu irqaction. */ @@ -2029,6 +2269,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ /* Found it - now remove it from the list of entries: */ desc->action = NULL; + desc->istate &= ~IRQS_NMI; + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -2082,6 +2324,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) } EXPORT_SYMBOL_GPL(free_percpu_irq); +void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !irq_settings_is_per_cpu_devid(desc)) + return; + + if (WARN_ON(!(desc->istate & IRQS_NMI))) + return; + + kfree(__free_percpu_irq(irq, dev_id)); +} + /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup @@ -2172,6 +2427,158 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(__request_percpu_irq); /** + * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @name: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs + * have to be setup on each CPU by calling prepare_percpu_nmi() before + * being enabled on the same CPU by using enable_percpu_nmi(). + * + * Dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. + * + * Interrupt lines requested for NMI delivering should have auto enabling + * setting disabled. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. 
+ */ +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, + const char *name, void __percpu *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + unsigned long flags; + int retval; + + if (!handler) + return -EINVAL; + + desc = irq_to_desc(irq); + + if (!desc || !irq_settings_can_request(desc) || + !irq_settings_is_per_cpu_devid(desc) || + irq_settings_can_autoenable(desc) || + !irq_supports_nmi(desc)) + return -EINVAL; + + /* The line cannot already be NMI */ + if (desc->istate & IRQS_NMI) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD + | IRQF_NOBALANCING; + action->name = name; + action->percpu_dev_id = dev_id; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + goto err_out; + + retval = __setup_irq(irq, desc, action); + if (retval) + goto err_irq_setup; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->istate |= IRQS_NMI; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; + +err_irq_setup: + irq_chip_pm_put(&desc->irq_data); +err_out: + kfree(action); + + return retval; +} + +/** + * prepare_percpu_nmi - performs CPU local setup for NMI delivery + * @irq: Interrupt line to prepare for NMI delivery + * + * This call prepares an interrupt line to deliver NMI on the current CPU, + * before that interrupt line gets enabled with enable_percpu_nmi(). + * + * As a CPU local operation, this should be called from non-preemptible + * context. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. 
+ */ +int prepare_percpu_nmi(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc; + int ret = 0; + + WARN_ON(preemptible()); + + desc = irq_get_desc_lock(irq, &flags, + IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return -EINVAL; + + if (WARN(!(desc->istate & IRQS_NMI), + KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", + irq)) { + ret = -EINVAL; + goto out; + } + + ret = irq_nmi_setup(desc); + if (ret) { + pr_err("Failed to setup NMI delivery: irq %u\n", irq); + goto out; + } + +out: + irq_put_desc_unlock(desc, flags); + return ret; +} + +/** + * teardown_percpu_nmi - undoes NMI setup of IRQ line + * @irq: Interrupt line from which CPU local NMI configuration should be + * removed + * + * This call undoes the setup done by prepare_percpu_nmi(). + * + * IRQ line should not be enabled for the current CPU. + * + * As a CPU local operation, this should be called from non-preemptible + * context. + */ +void teardown_percpu_nmi(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc; + + WARN_ON(preemptible()); + + desc = irq_get_desc_lock(irq, &flags, + IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return; + + if (WARN_ON(!(desc->istate & IRQS_NMI))) + goto out; + + irq_nmi_teardown(desc); +out: + irq_put_desc_unlock(desc, flags); +} + +/** * irq_get_irqchip_state - returns the irqchip state of a interrupt. 
* @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..90c735da15d0 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -9,6 +9,7 @@ #include <linux/idr.h> #include <linux/irq.h> #include <linux/math64.h> +#include <linux/log2.h> #include <trace/events/irq.h> @@ -18,16 +19,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); -struct irqt_stat { - u64 next_evt; - u64 last_ts; - u64 variance; - u32 avg; - u32 nr_samples; - int anomalies; - int valid; -}; - static DEFINE_IDR(irqt_stats); void irq_timings_enable(void) @@ -40,75 +31,360 @@ void irq_timings_disable(void) static_branch_disable(&irq_timing_enabled); } -/** - * irqs_update - update the irq timing statistics with a new timestamp +/* + * The main goal of this algorithm is to predict the next interrupt + * occurrence on the current CPU. + * + * Currently, the interrupt timings are stored in a circular array + * buffer every time there is an interrupt, as a tuple: the interrupt + * number and the associated timestamp when the event occurred <irq, + * timestamp>. 
+ * + * For every interrupt occurring in a short period of time, we can + * measure the elapsed time between the occurrences for the same + * interrupt and we end up with a suite of intervals. Experience + * shows that interrupts often follow a periodic + * pattern. + * + * The objective of the algorithm is to find out this periodic pattern + * in the fastest way and use its period to predict the next irq event. + * + * When the next interrupt event is requested, we are in the situation + * where the interrupts are disabled and the circular buffer + * containing the timings is filled with the events which happened + * after the previous next-interrupt-event request. + * + * At this point, we read the circular buffer and we fill the irq + * related statistics structure. After this step, the circular array + * containing the timings is empty because all the values are + * dispatched in their corresponding buffers. + * + * Now for each interrupt, we can predict the next event by using the + * suffix array, log interval and exponential moving average. + * + * 1. Suffix array + * + * Suffix array is an array of all the suffixes of a string. It is + * widely used as a data structure for compression, text search, ... + * For instance for the word 'banana', the suffixes will be: 'banana' + * 'anana' 'nana' 'ana' 'na' 'a' + * + * Usually, the suffix array is sorted but for our purpose it is + * not necessary and won't provide any improvement in the context of + * the solved problem where we clearly define the boundaries of the + * search by a max period and min period. + * + * The suffix array will build a suite of intervals of different + * length and will look for the repetition of each suite. If the suite + * is repeating then we have the period because it is the length of + * the suite whatever its position in the buffer. + * + * 2. Log interval + * + * We saw that the irq timings allow computing the interval of the + * occurrences for a specific interrupt.
We can reasonably assume that the + * longer the interval is, the higher the error for the next event is, + * and we can consider storing those interval values into an array + * where each slot in the array corresponds to an interval at the power + * of 2 of the index. For example, index 12 will contain values + * between 2^12 and 2^13. + * + * At the end we have an array of values where each index defines a + * [2^index, 2^(index + 1)) interval of values, allowing to store a large + * number of values inside a small array. + * + * For example, if we have the value 1123, then we store it at + * ilog2(1123) = 10 index value. + * + * Storing those values at the specific index is done by computing an + * exponential moving average for this specific slot. For instance, + * the values 1800, 1123, 1453, ... fall under the same slot (10) and + * the exponential moving average is computed every time a new value + * is stored at this slot. + * + * 3. Exponential Moving Average + * + * The EMA is largely used to track a signal for stocks or as a low + * pass filter. The magic of the formula is that it is very simple and the + * reactivity of the average can be tuned with the factors called + * alpha. + * + * The higher the alphas are, the faster the average responds to the + * signal change. In our case, if a slot in the array is a big + * interval, we can have numbers with a big difference between + * them. The impact of those differences in the average computation + * can be tuned by changing the alpha value. + * + * + * -- The algorithm -- + * + * We saw the different processing steps above, now let's see how they are + * used together.
+ * + * For each interrupt: + * For each interval: + * Compute the index = ilog2(interval) + * Compute a new_ema(buffer[index], interval) + * Store the index in a circular buffer + * + * Compute the suffix array of the indexes + * + * For each suffix: + * If the suffix is reverse-found 3 times + * Return suffix + * + * Return Not found + * + * However we can not have endless suffix array to be build, it won't + * make sense and it will add an extra overhead, so we can restrict + * this to a maximum suffix length of 5 and a minimum suffix length of + * 2. The experience showed 5 is the majority of the maximum pattern + * period found for different devices. + * + * The result is a pattern finding less than 1us for an interrupt. * - * @irqs: an irqt_stat struct pointer - * @ts: the new timestamp + * Example based on real values: * - * The statistics are computed online, in other words, the code is - * designed to compute the statistics on a stream of values rather - * than doing multiple passes on the values to compute the average, - * then the variance. The integer division introduces a loss of - * precision but with an acceptable error margin regarding the results - * we would have with the double floating precision: we are dealing - * with nanosec, so big numbers, consequently the mantisse is - * negligeable, especially when converting the time in usec - * afterwards. + * Example 1 : MMC write/read interrupt interval: * - * The computation happens at idle time. When the CPU is not idle, the - * interrupts' timestamps are stored in the circular buffer, when the - * CPU goes idle and this routine is called, all the buffer's values - * are injected in the statistical model continuying to extend the - * statistics from the previous busy-idle cycle. 
+ * 223947, 1240, 1384, 1386, 1386, + * 217416, 1236, 1384, 1386, 1387, + * 214719, 1241, 1386, 1387, 1384, + * 213696, 1234, 1384, 1386, 1388, + * 219904, 1240, 1385, 1389, 1385, + * 212240, 1240, 1386, 1386, 1386, + * 214415, 1236, 1384, 1386, 1387, + * 214276, 1234, 1384, 1388, ? * - * The observations showed a device will trigger a burst of periodic - * interrupts followed by one or two peaks of longer time, for - * instance when a SD card device flushes its cache, then the periodic - * intervals occur again. A one second inactivity period resets the - * stats, that gives us the certitude the statistical values won't - * exceed 1x10^9, thus the computation won't overflow. + * For each element, apply ilog2(value) * - * Basically, the purpose of the algorithm is to watch the periodic - * interrupts and eliminate the peaks. + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? * - * An interrupt is considered periodically stable if the interval of - * its occurences follow the normal distribution, thus the values - * comply with: + * Max period of 5, we take the last (max_period * 3) 15 elements as + * we can be confident if the pattern repeats itself three times it is + * a repeating pattern. * - * avg - 3 x stddev < value < avg + 3 x stddev + * 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? * - * Which can be simplified to: + * Suffixes are: * - * -3 x stddev < value - avg < 3 x stddev + * 1) 8, 15, 8, 8, 8 <- max period + * 2) 8, 15, 8, 8 + * 3) 8, 15, 8 + * 4) 8, 15 <- min period * - * abs(value - avg) < 3 x stddev + * From there we search the repeating pattern for each suffix. * - * In order to save a costly square root computation, we use the - * variance. For the record, stddev = sqrt(variance). 
The equation - * above becomes: + * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 + * | | | | | | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | + * 8, 15, 8, 8, 8 * - * abs(value - avg) < 3 x sqrt(variance) + * When moving the suffix, we found exactly 3 matches. * - * And finally we square it: + * The first suffix with period 5 is repeating. * - * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 + * The next event is (3 * max_period) % suffix_period * - * (value - avg) x (value - avg) < 9 x variance + * In this example, the result 0, so the next event is suffix[0] => 8 * - * Statistically speaking, any values out of this interval is - * considered as an anomaly and is discarded. However, a normal - * distribution appears when the number of samples is 30 (it is the - * rule of thumb in statistics, cf. "30 samples" on Internet). When - * there are three consecutive anomalies, the statistics are resetted. + * However, 8 is the index in the array of exponential moving average + * which was calculated on the fly when storing the values, so the + * interval is ema[8] = 1366 * + * + * Example 2: + * + * 4, 3, 5, 100, + * 3, 3, 5, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 5, 3, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 4, 5, 112, + * 4, 3, 4, 110 + * + * ilog2 + * + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Max period 5: + * 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Suffixes: + * + * 1) 0, 0, 4, 0, 0 + * 2) 0, 0, 4, 0 + * 3) 0, 0, 4 + * 4) 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | X + * 0, 0, 4, 0, 0, | X + * 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | + * 0, 0, 4, 0, | | | + * 0 0 4 + * + * Pattern is found 3 times, the remaining 
is 1 which results from + * (max_period * 3) % suffix_period. This value is the index in the + * suffix arrays. The suffix array for a period 4 has the value 4 + * at index 1. + */ +#define EMA_ALPHA_VAL 64 +#define EMA_ALPHA_SHIFT 7 + +#define PREDICTION_PERIOD_MIN 2 +#define PREDICTION_PERIOD_MAX 5 +#define PREDICTION_FACTOR 4 +#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ +#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ + +struct irqt_stat { + u64 last_ts; + u64 ema_time[PREDICTION_BUFFER_SIZE]; + int timings[IRQ_TIMINGS_SIZE]; + int circ_timings[IRQ_TIMINGS_SIZE]; + int count; +}; + +/* + * Exponential moving average computation */ -static void irqs_update(struct irqt_stat *irqs, u64 ts) +static u64 irq_timings_ema_new(u64 value, u64 ema_old) +{ + s64 diff; + + if (unlikely(!ema_old)) + return value; + + diff = (value - ema_old) * EMA_ALPHA_VAL; + /* + * We can use a s64 type variable to be added with the u64 + * ema_old variable as this one will never have its topmost + * bit set, it will be always smaller than 2^63 nanosec + * interrupt interval (292 years). + */ + return ema_old + (diff >> EMA_ALPHA_SHIFT); +} + +static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) +{ + int i; + + /* + * The buffer contains the suite of intervals, in a ilog2 + * basis, we are looking for a repetition. We point the + * beginning of the search three times the length of the + * period beginning at the end of the buffer. We do that for + * each suffix. + */ + for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { + + int *begin = &buffer[len - (i * 3)]; + int *ptr = begin; + + /* + * We look if the suite with period 'i' repeat + * itself. If it is truncated at the end, as it + * repeats we can use the period to find out the next + * element. 
+ */ + while (!memcmp(ptr, begin, i * sizeof(*ptr))) { + ptr += i; + if (ptr >= &buffer[len]) + return begin[((i * 3) % i)]; + } + } + + return -1; +} + +static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) +{ + int index, i, period_max, count, start, min = INT_MAX; + + if ((now - irqs->last_ts) >= NSEC_PER_SEC) { + irqs->count = irqs->last_ts = 0; + return U64_MAX; + } + + /* + * As we want to find three times the repetition, we need a + * number of intervals greater or equal to three times the + * maximum period, otherwise we truncate the max period. + */ + period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? + PREDICTION_PERIOD_MAX : irqs->count / 3; + + /* + * If we don't have enough irq timings for this prediction, + * just bail out. + */ + if (period_max <= PREDICTION_PERIOD_MIN) + return U64_MAX; + + /* + * 'count' will depends if the circular buffer wrapped or not + */ + count = irqs->count < IRQ_TIMINGS_SIZE ? + irqs->count : IRQ_TIMINGS_SIZE; + + start = irqs->count < IRQ_TIMINGS_SIZE ? + 0 : (irqs->count & IRQ_TIMINGS_MASK); + + /* + * Copy the content of the circular buffer into another buffer + * in order to linearize the buffer instead of dealing with + * wrapping indexes and shifted array which will be prone to + * error and extremelly difficult to debug. 
+ */ + for (i = 0; i < count; i++) { + int index = (start + i) & IRQ_TIMINGS_MASK; + + irqs->timings[i] = irqs->circ_timings[index]; + min = min_t(int, irqs->timings[i], min); + } + + index = irq_timings_next_event_index(irqs->timings, count, period_max); + if (index < 0) + return irqs->last_ts + irqs->ema_time[min]; + + return irqs->last_ts + irqs->ema_time[index]; +} + +static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) { u64 old_ts = irqs->last_ts; - u64 variance = 0; u64 interval; - s64 diff; + int index; /* * The timestamps are absolute time values, we need to compute @@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts) * want as we need another timestamp to compute an interval. */ if (interval >= NSEC_PER_SEC) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; + irqs->count = 0; return; } /* - * Pre-compute the delta with the average as the result is - * used several times in this function. - */ - diff = interval - irqs->avg; - - /* - * Increment the number of samples. - */ - irqs->nr_samples++; - - /* - * Online variance divided by the number of elements if there - * is more than one sample. Normally the formula is division - * by nr_samples - 1 but we assume the number of element will be - * more than 32 and dividing by 32 instead of 31 is enough - * precise. - */ - if (likely(irqs->nr_samples > 1)) - variance = irqs->variance >> IRQ_TIMINGS_SHIFT; - - /* - * The rule of thumb in statistics for the normal distribution - * is having at least 30 samples in order to have the model to - * apply. Values outside the interval are considered as an - * anomaly. - */ - if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { - /* - * After three consecutive anomalies, we reset the - * stats as it is no longer stable enough. 
- */ - if (irqs->anomalies++ >= 3) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; - return; - } - } else { - /* - * The anomalies must be consecutives, so at this - * point, we reset the anomalies counter. - */ - irqs->anomalies = 0; - } - - /* - * The interrupt is considered stable enough to try to predict - * the next event on it. + * Get the index in the ema table for this interrupt. The + * PREDICTION_FACTOR increase the interval size for the array + * of exponential average. */ - irqs->valid = 1; + index = likely(interval) ? + ilog2((interval >> 10) / PREDICTION_FACTOR) : 0; /* - * Online average algorithm: - * - * new_average = average + ((value - average) / count) - * - * The variance computation depends on the new average - * to be computed here first. - * + * Store the index as an element of the pattern in another + * circular array. */ - irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); + irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; - /* - * Online variance algorithm: - * - * new_variance = variance + (value - average) x (value - new_average) - * - * Warning: irqs->avg is updated with the line above, hence - * 'interval - irqs->avg' is no longer equal to 'diff' - */ - irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); + irqs->ema_time[index] = irq_timings_ema_new(interval, + irqs->ema_time[index]); - /* - * Update the next event - */ - irqs->next_evt = ts + irqs->avg; + irqs->count++; } /** @@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now) */ lockdep_assert_irqs_disabled(); + if (!irqts->count) + return next_evt; + /* * Number of elements in the circular buffer: If it happens it * was flushed before, then the number of elements could be @@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now) * type but with the cost of extra computation in the * interrupt handler hot path. We choose efficiency. 
* - * Inject measured irq/timestamp to the statistical model - * while decrementing the counter because we consume the data - * from our circular buffer. + * Inject measured irq/timestamp to the pattern prediction + * model while decrementing the counter because we consume the + * data from our circular buffer. */ - for (i = irqts->count & IRQ_TIMINGS_MASK, - irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); - irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { - irq = irq_timing_decode(irqts->values[i], &ts); + i = (irqts->count & IRQ_TIMINGS_MASK) - 1; + irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); + for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { + irq = irq_timing_decode(irqts->values[i], &ts); s = idr_find(&irqt_stats, irq); - if (s) { - irqs = this_cpu_ptr(s); - irqs_update(irqs, ts); - } + if (s) + irq_timings_store(irq, this_cpu_ptr(s), ts); } /* @@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now) irqs = this_cpu_ptr(s); - if (!irqs->valid) - continue; + ts = __irq_timings_next_event(irqs, i, now); + if (ts <= now) + return now; - if (irqs->next_evt <= now) { - irq = i; - next_evt = now; - - /* - * This interrupt mustn't use in the future - * until new events occur and update the - * statistics. - */ - irqs->valid = 0; - break; - } - - if (irqs->next_evt < next_evt) { - irq = i; - next_evt = irqs->next_evt; - } + if (ts < next_evt) + next_evt = ts; } return next_evt; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 6b7cdf17ccf8..73288914ed5e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void) */ } -/* - * Enqueue the irq_work @work on @cpu unless it's already pending - * somewhere. - * - * Can be re-enqueued while the callback is still in progress. 
- */ -bool irq_work_queue_on(struct irq_work *work, int cpu) +/* Enqueue on current CPU, work must already be claimed and preempt disabled */ +static void __irq_work_queue_local(struct irq_work *work) { - /* All work should have been flushed before going offline */ - WARN_ON_ONCE(cpu_is_offline(cpu)); - -#ifdef CONFIG_SMP - - /* Arch remote IPI send/receive backend aren't NMI safe */ - WARN_ON_ONCE(in_nmi()); + /* If the work is "lazy", handle it from next tick if any */ + if (work->flags & IRQ_WORK_LAZY) { + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + arch_irq_work_raise(); + } +} +/* Enqueue the irq work @work on the current CPU */ +bool irq_work_queue(struct irq_work *work) +{ /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); - -#else /* #ifdef CONFIG_SMP */ - irq_work_queue(work); -#endif /* #else #ifdef CONFIG_SMP */ + /* Queue the entry and raise the IPI if needed. */ + preempt_disable(); + __irq_work_queue_local(work); + preempt_enable(); return true; } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* Enqueue the irq work @work on the current CPU */ -bool irq_work_queue(struct irq_work *work) +/* + * Enqueue the irq_work @work on @cpu unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue_on(struct irq_work *work, int cpu) { +#ifndef CONFIG_SMP + return irq_work_queue(work); + +#else /* CONFIG_SMP: */ + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(cpu)); + /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - /* Queue the entry and raise the IPI if needed. 
*/ preempt_disable(); - - /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) + arch_send_call_function_single_ipi(cpu); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); + __irq_work_queue_local(work); } - preempt_enable(); return true; +#endif /* CONFIG_SMP */ } -EXPORT_SYMBOL_GPL(irq_work_queue); + bool irq_work_needs_cpu(void) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bad96b476eb6..de6efdecc70d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -202,11 +202,13 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void __static_key_slow_dec_cpuslocked(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static bool static_key_slow_try_dec(struct static_key *key) { - lockdep_assert_cpus_held(); + int val; + + val = atomic_fetch_add_unless(&key->enabled, -1, 1); + if (val == 1) + return false; /* * The negative count check is valid even when a negative @@ -215,63 +217,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, * returns is unbalanced, because all other static_key_slow_inc() * instances block while the update is in progress. 
*/ - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { - WARN(atomic_read(&key->enabled) < 0, - "jump label: negative count!\n"); + WARN(val < 0, "jump label: negative count!\n"); + return true; +} + +static void __static_key_slow_dec_cpuslocked(struct static_key *key) +{ + lockdep_assert_cpus_held(); + + if (static_key_slow_try_dec(key)) return; - } - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else { + jump_label_lock(); + if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); - } jump_label_unlock(); } -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); - __static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } -static void jump_label_update_timeout(struct work_struct *work) +void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); - __static_key_slow_dec(&key->key, 0, NULL); + __static_key_slow_dec(&key->key); } +EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(key, 0, NULL); + __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec_cpuslocked(key, 0, NULL); + __static_key_slow_dec_cpuslocked(key); } -void static_key_slow_dec_deferred(struct static_key_deferred *key) +void __static_key_slow_dec_deferred(struct static_key *key, + struct delayed_work *work, + unsigned long timeout) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(&key->key, key->timeout, &key->work); + + if (static_key_slow_try_dec(key)) + return; + + schedule_delayed_work(work, timeout); } 
-EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -void static_key_deferred_flush(struct static_key_deferred *key) +void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); - flush_delayed_work(&key->work); + flush_delayed_work(work); } -EXPORT_SYMBOL_GPL(static_key_deferred_flush); +EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f3a04994e063..14934afa9e68 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -494,7 +494,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { - iter->module_name[0] = '\0'; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, diff --git a/kernel/kcov.c b/kernel/kcov.c index c2277dbdbfb1..2ee38727844a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -20,6 +20,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/kcov.h> +#include <linux/refcount.h> #include <asm/setup.h> /* Number of 64-bit words written per one comparison: */ @@ -44,7 +45,7 @@ struct kcov { * - opened file descriptor * - task with enabled coverage (we can't unwire it from another task) */ - atomic_t refcount; + refcount_t refcount; /* The lock protects mode, size, area and t. 
*/ spinlock_t lock; enum kcov_mode mode; @@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch); static void kcov_get(struct kcov *kcov) { - atomic_inc(&kcov->refcount); + refcount_inc(&kcov->refcount); } static void kcov_put(struct kcov *kcov) { - if (atomic_dec_and_test(&kcov->refcount)) { + if (refcount_dec_and_test(&kcov->refcount)) { vfree(kcov->area); kfree(kcov); } @@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep) if (!kcov) return -ENOMEM; kcov->mode = KCOV_MODE_DISABLED; - atomic_set(&kcov->refcount, 1); + refcount_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; return nonseekable_open(inode, filep); @@ -444,10 +445,8 @@ static int __init kcov_init(void) * there is no need to protect it against removal races. The * use of debugfs_create_file_unsafe() is actually safe here. */ - if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { - pr_err("failed to create kcov in debugfs\n"); - return -ENOMEM; - } + debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); + return 0; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d7140447be75..fd5c95ff9251 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1150,7 +1150,7 @@ int kernel_kexec(void) error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); @@ -1183,7 +1183,7 @@ int kernel_kexec(void) Enable_irqs: local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..f7fb8f6a688f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image) goto out_free_desc; desc->tfm = tfm; - desc->flags 
= 0; ret = crypto_shash_init(desc); if (ret < 0) diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/uaccess.h> + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. + */ + +asm ( +" .pushsection .rodata, \"a\" \n" +" .global kernel_headers_data \n" +"kernel_headers_data: \n" +" .incbin \"kernel/kheaders_data.tar.xz\" \n" +" .global kernel_headers_data_end \n" +"kernel_headers_data_end: \n" +" .popsection \n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read_current(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + return simple_read_from_buffer(buf, len, offset, + &kernel_headers_data, + &kernel_headers_data_end - + &kernel_headers_data); +} + +static const struct file_operations ikheaders_file_ops = { + .read = ikheaders_read_current, + .llseek = default_llseek, +}; + +static int __init ikheaders_init(void) +{ + struct proc_dir_entry *entry; + + /* create the current headers file */ + entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, + &ikheaders_file_ops); + if (!entry) + return -ENOMEM; + + proc_set_size(entry, + &kernel_headers_data_end - + &kernel_headers_data); + return 0; +} + +static void __exit ikheaders_cleanup(void) +{ + remove_proc_entry("kheaders.tar.xz", NULL); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header 
artifacts used to build the kernel"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f4ddfdd2d07e..b1ea30a5540e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -709,7 +709,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; - int ret; /* * Unused kprobe MUST be on the way of delayed unoptimizing (means @@ -720,9 +719,8 @@ static int reuse_unused_kprobe(struct kprobe *ap) /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ - ret = kprobe_optready(ap); - if (ret) - return ret; + if (!kprobe_optready(ap)) + return -EINVAL; optimize_kprobe(ap); return 0; @@ -1396,7 +1394,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -bool within_kprobe_blacklist(unsigned long addr) +static bool __within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; @@ -1410,7 +1408,26 @@ bool within_kprobe_blacklist(unsigned long addr) if (addr >= ent->start_addr && addr < ent->end_addr) return true; } + return false; +} +bool within_kprobe_blacklist(unsigned long addr) +{ + char symname[KSYM_NAME_LEN], *p; + + if (__within_kprobe_blacklist(addr)) + return true; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return false; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_kprobe_blacklist(addr); + } return false; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 087d18d771b5..5942eeafb9ac 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/numa.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -101,6 +102,12 @@ bool kthread_should_stop(void) } 
EXPORT_SYMBOL(kthread_should_stop); +bool __kthread_should_park(struct task_struct *k) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); +} +EXPORT_SYMBOL_GPL(__kthread_should_park); + /** * kthread_should_park - should this kthread park now? * @@ -114,7 +121,7 @@ EXPORT_SYMBOL(kthread_should_stop); */ bool kthread_should_park(void) { - return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); + return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); @@ -599,7 +606,7 @@ void __kthread_init_worker(struct kthread_worker *worker, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); - spin_lock_init(&worker->lock); + raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); @@ -641,21 +648,21 @@ repeat: if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); worker->task = NULL; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); @@ -675,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -812,12 +819,12 @@ bool kthread_queue_work(struct kthread_worker *worker, bool ret = false; unsigned long flags; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { 
kthread_insert_work(worker, work, &worker->work_list); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); @@ -835,6 +842,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; + unsigned long flags; /* * This might happen when a pending work is reinitialized. @@ -843,7 +851,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) if (WARN_ON_ONCE(!worker)) return; - spin_lock(&worker->lock); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -852,7 +860,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) list_del_init(&work->node); kthread_insert_work(worker, work, &worker->work_list); - spin_unlock(&worker->lock); + raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); @@ -908,14 +916,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, unsigned long flags; bool ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); @@ -951,7 +959,7 @@ void kthread_flush_work(struct kthread_work *work) if (!worker) return; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/ WARN_ON_ONCE(work->worker != worker); @@ -963,7 +971,7 @@ void kthread_flush_work(struct kthread_work *work) else noop = true; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); @@ -996,9 +1004,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, * any queuing is blocked by setting the canceling counter. */ work->canceling++; - spin_unlock_irqrestore(&worker->lock, *flags); + raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); - spin_lock_irqsave(&worker->lock, *flags); + raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } @@ -1045,7 +1053,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, unsigned long flags; int ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) @@ -1062,7 +1070,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); @@ -1076,7 +1084,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) if (!worker) goto out; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -1090,13 +1098,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) * In the meantime, block any queuing by setting the canceling counter. 
*/ work->canceling++; - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 96b4179cee6a..99a5b5f46dc5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk, break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry marks end of backtrace: */ + if (!record) break; } if (same) { @@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk, memcpy(&latency_record[i], lat, sizeof(struct latency_record)); } -/* - * Iterator to store a backtrace into a latency record entry - */ -static inline void store_stacktrace(struct task_struct *tsk, - struct latency_record *lat) -{ - struct stack_trace trace; - - memset(&trace, 0, sizeof(trace)); - trace.max_entries = LT_BACKTRACEDEPTH; - trace.entries = &lat->backtrace[0]; - save_stack_trace_tsk(tsk, &trace); -} - /** * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency @@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.count = 1; lat.time = usecs; lat.max = usecs; - store_stacktrace(tsk, &lat); + + stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0); raw_spin_lock_irqsave(&latency_lock, flags); @@ -210,8 +197,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry is end of backtrace */ + if (!record) break; } if (same) { @@ -252,10 +239,10 @@ 
static int lstats_show(struct seq_file *m, void *v) lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (bt == ULONG_MAX) - break; + seq_printf(m, " %ps", (void *)bt); } seq_puts(m, "\n"); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5b77a7314e01..f6fbaff10e71 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -45,7 +45,12 @@ */ DEFINE_MUTEX(klp_mutex); -static LIST_HEAD(klp_patches); +/* + * Actively used patches: enabled or in transition. Note that replaced + * or disabled patches are not listed even though the related kernel + * module still can be loaded. + */ +LIST_HEAD(klp_patches); static struct kobject *klp_root_kobj; @@ -82,20 +87,43 @@ static void klp_find_object_module(struct klp_object *obj) mutex_unlock(&module_mutex); } -static bool klp_is_patch_registered(struct klp_patch *patch) +static bool klp_initialized(void) +{ + return !!klp_root_kobj; +} + +static struct klp_func *klp_find_func(struct klp_object *obj, + struct klp_func *old_func) { - struct klp_patch *mypatch; + struct klp_func *func; - list_for_each_entry(mypatch, &klp_patches, list) - if (mypatch == patch) - return true; + klp_for_each_func(obj, func) { + if ((strcmp(old_func->old_name, func->old_name) == 0) && + (old_func->old_sympos == func->old_sympos)) { + return func; + } + } - return false; + return NULL; } -static bool klp_initialized(void) +static struct klp_object *klp_find_object(struct klp_patch *patch, + struct klp_object *old_obj) { - return !!klp_root_kobj; + struct klp_object *obj; + + klp_for_each_object(patch, obj) { + if (klp_is_module(old_obj)) { + if (klp_is_module(obj) && + strcmp(old_obj->name, obj->name) == 0) { + return obj; + } + } else if (!klp_is_module(obj)) { + return obj; + } + } + + return NULL; } struct klp_find_arg { @@ -278,170 +306,6 @@ static int klp_write_object_relocations(struct module *pmod, return ret; } -static int 
__klp_disable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - - if (WARN_ON(!patch->enabled)) - return -EINVAL; - - if (klp_transition_patch) - return -EBUSY; - - /* enforce stacking: only the last enabled patch can be disabled */ - if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->enabled) - return -EBUSY; - - klp_init_transition(patch, KLP_UNPATCHED); - - klp_for_each_object(patch, obj) - if (obj->patched) - klp_pre_unpatch_callback(obj); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the TIF_PATCH_PENDING writes in - * klp_start_transition(). In the rare case where klp_ftrace_handler() - * is called shortly after klp_update_patch_state() switches the task, - * this ensures the handler sees that func->transition is set. - */ - smp_wmb(); - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = false; - - return 0; -} - -/** - * klp_disable_patch() - disables a registered patch - * @patch: The registered, enabled patch to be disabled - * - * Unregisters the patched functions from ftrace. - * - * Return: 0 on success, otherwise error - */ -int klp_disable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - if (!patch->enabled) { - ret = -EINVAL; - goto err; - } - - ret = __klp_disable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_disable_patch); - -static int __klp_enable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - int ret; - - if (klp_transition_patch) - return -EBUSY; - - if (WARN_ON(patch->enabled)) - return -EINVAL; - - /* enforce stacking: only the first disabled patch can be enabled */ - if (patch->list.prev != &klp_patches && - !list_prev_entry(patch, list)->enabled) - return -EBUSY; - - /* - * A reference is taken on the patch module to prevent it from being - * unloaded. 
- */ - if (!try_module_get(patch->mod)) - return -ENODEV; - - pr_notice("enabling patch '%s'\n", patch->mod->name); - - klp_init_transition(patch, KLP_PATCHED); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the ops->func_stack writes in - * klp_patch_object(), so that klp_ftrace_handler() will see the - * func->transition updates before the handler is registered and the - * new funcs become visible to the handler. - */ - smp_wmb(); - - klp_for_each_object(patch, obj) { - if (!klp_is_object_loaded(obj)) - continue; - - ret = klp_pre_patch_callback(obj); - if (ret) { - pr_warn("pre-patch callback failed for object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - - ret = klp_patch_object(obj); - if (ret) { - pr_warn("failed to patch object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - } - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = true; - - return 0; -err: - pr_warn("failed to enable patch '%s'\n", patch->mod->name); - - klp_cancel_transition(); - return ret; -} - -/** - * klp_enable_patch() - enables a registered patch - * @patch: The registered, disabled patch to be enabled - * - * Performs the needed symbol lookups and code relocations, - * then registers the patched functions with ftrace. 
- * - * Return: 0 on success, otherwise error - */ -int klp_enable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - ret = __klp_enable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_enable_patch); - /* * Sysfs Interface * @@ -449,11 +313,11 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/transition - * /sys/kernel/livepatch/<patch>/signal * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ +static int __klp_disable_patch(struct klp_patch *patch); static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -470,40 +334,32 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&klp_mutex); - if (!klp_is_patch_registered(patch)) { - /* - * Module with the patch could either disappear meanwhile or is - * not properly initialized yet. - */ - ret = -EINVAL; - goto err; - } - if (patch->enabled == enabled) { /* already in requested state */ ret = -EINVAL; - goto err; + goto out; } - if (patch == klp_transition_patch) { + /* + * Allow to reverse a pending transition in both ways. It might be + * necessary to complete the transition without forcing and breaking + * the system integrity. + * + * Do not allow to re-enable a disabled patch. 
+ */ + if (patch == klp_transition_patch) klp_reverse_transition(); - } else if (enabled) { - ret = __klp_enable_patch(patch); - if (ret) - goto err; - } else { + else if (!enabled) ret = __klp_disable_patch(patch); - if (ret) - goto err; - } + else + ret = -EINVAL; +out: mutex_unlock(&klp_mutex); + if (ret) + return ret; return count; - -err: - mutex_unlock(&klp_mutex); - return ret; } static ssize_t enabled_show(struct kobject *kobj, @@ -525,35 +381,6 @@ static ssize_t transition_show(struct kobject *kobj, patch == klp_transition_patch); } -static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct klp_patch *patch; - int ret; - bool val; - - ret = kstrtobool(buf, &val); - if (ret) - return ret; - - if (!val) - return count; - - mutex_lock(&klp_mutex); - - patch = container_of(kobj, struct klp_patch, kobj); - if (patch != klp_transition_patch) { - mutex_unlock(&klp_mutex); - return -EINVAL; - } - - klp_send_signals(); - - mutex_unlock(&klp_mutex); - - return count; -} - static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -585,15 +412,132 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); -static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, - &signal_kobj_attr.attr, &force_kobj_attr.attr, NULL }; +ATTRIBUTE_GROUPS(klp_patch); + +static void klp_free_object_dynamic(struct klp_object *obj) +{ + kfree(obj->name); + kfree(obj); +} + +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object 
*obj); + +static struct klp_object *klp_alloc_object_dynamic(const char *name, + struct klp_patch *patch) +{ + struct klp_object *obj; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return NULL; + + if (name) { + obj->name = kstrdup(name, GFP_KERNEL); + if (!obj->name) { + kfree(obj); + return NULL; + } + } + + klp_init_object_early(patch, obj); + obj->dynamic = true; + + return obj; +} + +static void klp_free_func_nop(struct klp_func *func) +{ + kfree(func->old_name); + kfree(func); +} + +static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, + struct klp_object *obj) +{ + struct klp_func *func; + + func = kzalloc(sizeof(*func), GFP_KERNEL); + if (!func) + return NULL; + + if (old_func->old_name) { + func->old_name = kstrdup(old_func->old_name, GFP_KERNEL); + if (!func->old_name) { + kfree(func); + return NULL; + } + } + + klp_init_func_early(obj, func); + /* + * func->new_func is same as func->old_func. These addresses are + * set when the object is loaded, see klp_init_object_loaded(). + */ + func->old_sympos = old_func->old_sympos; + func->nop = true; + + return func; +} + +static int klp_add_object_nops(struct klp_patch *patch, + struct klp_object *old_obj) +{ + struct klp_object *obj; + struct klp_func *func, *old_func; + + obj = klp_find_object(patch, old_obj); + + if (!obj) { + obj = klp_alloc_object_dynamic(old_obj->name, patch); + if (!obj) + return -ENOMEM; + } + + klp_for_each_func(old_obj, old_func) { + func = klp_find_func(obj, old_func); + if (func) + continue; + + func = klp_alloc_func_nop(old_func, obj); + if (!func) + return -ENOMEM; + } + + return 0; +} + +/* + * Add 'nop' functions which simply return to the caller to run + * the original function. The 'nop' functions are added to a + * patch to facilitate a 'replace' mode. 
+ */ +static int klp_add_nops(struct klp_patch *patch) +{ + struct klp_patch *old_patch; + struct klp_object *old_obj; + + klp_for_each_patch(old_patch) { + klp_for_each_object(old_patch, old_obj) { + int err; + + err = klp_add_object_nops(patch, old_obj); + if (err) + return err; + } + } + + return 0; +} static void klp_kobj_release_patch(struct kobject *kobj) { @@ -606,11 +550,17 @@ static void klp_kobj_release_patch(struct kobject *kobj) static struct kobj_type klp_ktype_patch = { .release = klp_kobj_release_patch, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = klp_patch_attrs, + .default_groups = klp_patch_groups, }; static void klp_kobj_release_object(struct kobject *kobj) { + struct klp_object *obj; + + obj = container_of(kobj, struct klp_object, kobj); + + if (obj->dynamic) + klp_free_object_dynamic(obj); } static struct kobj_type klp_ktype_object = { @@ -620,6 +570,12 @@ static struct kobj_type klp_ktype_object = { static void klp_kobj_release_func(struct kobject *kobj) { + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + + if (func->nop) + klp_free_func_nop(func); } static struct kobj_type klp_ktype_func = { @@ -627,17 +583,17 @@ static struct kobj_type klp_ktype_func = { .sysfs_ops = &kobj_sysfs_ops, }; -/* - * Free all functions' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. 
- */ -static void klp_free_funcs_limited(struct klp_object *obj, - struct klp_func *limit) +static void __klp_free_funcs(struct klp_object *obj, bool nops_only) { - struct klp_func *func; + struct klp_func *func, *tmp_func; + + klp_for_each_func_safe(obj, func, tmp_func) { + if (nops_only && !func->nop) + continue; - for (func = obj->funcs; func->old_name && func != limit; func++) + list_del(&func->node); kobject_put(&func->kobj); + } } /* Clean up when a patched object is unloaded */ @@ -647,35 +603,101 @@ static void klp_free_object_loaded(struct klp_object *obj) obj->mod = NULL; - klp_for_each_func(obj, func) - func->old_addr = 0; + klp_for_each_func(obj, func) { + func->old_func = NULL; + + if (func->nop) + func->new_func = NULL; + } } -/* - * Free all objects' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. - */ -static void klp_free_objects_limited(struct klp_patch *patch, - struct klp_object *limit) +static void __klp_free_objects(struct klp_patch *patch, bool nops_only) { - struct klp_object *obj; + struct klp_object *obj, *tmp_obj; + + klp_for_each_object_safe(patch, obj, tmp_obj) { + __klp_free_funcs(obj, nops_only); - for (obj = patch->objs; obj->funcs && obj != limit; obj++) { - klp_free_funcs_limited(obj, NULL); + if (nops_only && !obj->dynamic) + continue; + + list_del(&obj->node); kobject_put(&obj->kobj); } } -static void klp_free_patch(struct klp_patch *patch) +static void klp_free_objects(struct klp_patch *patch) +{ + __klp_free_objects(patch, false); +} + +static void klp_free_objects_dynamic(struct klp_patch *patch) +{ + __klp_free_objects(patch, true); +} + +/* + * This function implements the free operations that can be called safely + * under klp_mutex. + * + * The operation must be completed by calling klp_free_patch_finish() + * outside klp_mutex. 
+ */ +void klp_free_patch_start(struct klp_patch *patch) { - klp_free_objects_limited(patch, NULL); if (!list_empty(&patch->list)) list_del(&patch->list); + + klp_free_objects(patch); +} + +/* + * This function implements the free part that must be called outside + * klp_mutex. + * + * It must be called after klp_free_patch_start(). And it has to be + * the last function accessing the livepatch structures when the patch + * gets disabled. + */ +static void klp_free_patch_finish(struct klp_patch *patch) +{ + /* + * Avoid deadlock with enabled_store() sysfs callback by + * calling this outside klp_mutex. It is safe because + * this is called when the patch gets disabled and it + * cannot get enabled again. + */ + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + + /* Put the module after the last access to struct klp_patch. */ + if (!patch->forced) + module_put(patch->mod); +} + +/* + * The livepatch might be freed from sysfs interface created by the patch. + * This work allows to wait until the interface is destroyed in a separate + * context. + */ +static void klp_free_patch_work_fn(struct work_struct *work) +{ + struct klp_patch *patch = + container_of(work, struct klp_patch, free_work); + + klp_free_patch_finish(patch); } static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - if (!func->old_name || !func->new_func) + if (!func->old_name) + return -EINVAL; + + /* + * NOPs get the address later. The patched module must be loaded, + * see klp_init_object_loaded(). + */ + if (!func->new_func && !func->nop) return -EINVAL; if (strlen(func->old_name) >= KSYM_NAME_LEN) @@ -690,9 +712,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? 
func->old_sympos : 1); + return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", + func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -721,11 +743,11 @@ static int klp_init_object_loaded(struct klp_patch *patch, klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, - &func->old_addr); + (unsigned long *)&func->old_func); if (ret) return ret; - ret = kallsyms_lookup_size_offset(func->old_addr, + ret = kallsyms_lookup_size_offset((unsigned long)func->old_func, &func->old_size, NULL); if (!ret) { pr_err("kallsyms size lookup failed for '%s'\n", @@ -733,6 +755,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, return -ENOENT; } + if (func->nop) + func->new_func = func->old_func; + ret = kallsyms_lookup_size_offset((unsigned long)func->new_func, &func->new_size, NULL); if (!ret) { @@ -751,9 +776,6 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) int ret; const char *name; - if (!obj->funcs) - return -EINVAL; - if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) return -EINVAL; @@ -763,126 +785,200 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? 
obj->name : "vmlinux"; - ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, - &patch->kobj, "%s", name); + ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name); if (ret) return ret; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); if (ret) - goto free; + return ret; } - if (klp_is_object_loaded(obj)) { + if (klp_is_object_loaded(obj)) ret = klp_init_object_loaded(patch, obj); - if (ret) - goto free; - } - return 0; - -free: - klp_free_funcs_limited(obj, func); - kobject_put(&obj->kobj); return ret; } -static int klp_init_patch(struct klp_patch *patch) +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func) +{ + kobject_init(&func->kobj, &klp_ktype_func); + list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj) +{ + INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); + list_add_tail(&obj->node, &patch->obj_list); +} + +static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; - int ret; + struct klp_func *func; if (!patch->objs) return -EINVAL; - mutex_lock(&klp_mutex); - + INIT_LIST_HEAD(&patch->list); + INIT_LIST_HEAD(&patch->obj_list); + kobject_init(&patch->kobj, &klp_ktype_patch); patch->enabled = false; + patch->forced = false; + INIT_WORK(&patch->free_work, klp_free_patch_work_fn); init_completion(&patch->finish); - ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, "%s", patch->mod->name); - if (ret) { - mutex_unlock(&klp_mutex); + klp_for_each_object_static(patch, obj) { + if (!obj->funcs) + return -EINVAL; + + klp_init_object_early(patch, obj); + + klp_for_each_func_static(obj, func) { + klp_init_func_early(obj, func); + } + } + + if (!try_module_get(patch->mod)) + return -ENODEV; + + return 0; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", 
patch->mod->name); + if (ret) return ret; + + if (patch->replace) { + ret = klp_add_nops(patch); + if (ret) + return ret; } klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); if (ret) - goto free; + return ret; } list_add_tail(&patch->list, &klp_patches); - mutex_unlock(&klp_mutex); - return 0; +} -free: - klp_free_objects_limited(patch, obj); +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; - mutex_unlock(&klp_mutex); + if (WARN_ON(!patch->enabled)) + return -EINVAL; - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + if (klp_transition_patch) + return -EBUSY; - return ret; + klp_init_transition(patch, KLP_UNPATCHED); + + klp_for_each_object(patch, obj) + if (obj->patched) + klp_pre_unpatch_callback(obj); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the TIF_PATCH_PENDING writes in + * klp_start_transition(). In the rare case where klp_ftrace_handler() + * is called shortly after klp_update_patch_state() switches the task, + * this ensures the handler sees that func->transition is set. + */ + smp_wmb(); + + klp_start_transition(); + patch->enabled = false; + klp_try_complete_transition(); + + return 0; } -/** - * klp_unregister_patch() - unregisters a patch - * @patch: Disabled patch to be unregistered - * - * Frees the data structures and removes the sysfs interface. 
- * - * Return: 0 on success, otherwise error - */ -int klp_unregister_patch(struct klp_patch *patch) +static int __klp_enable_patch(struct klp_patch *patch) { + struct klp_object *obj; int ret; - mutex_lock(&klp_mutex); + if (klp_transition_patch) + return -EBUSY; - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } + if (WARN_ON(patch->enabled)) + return -EINVAL; - if (patch->enabled) { - ret = -EBUSY; - goto err; - } + pr_notice("enabling patch '%s'\n", patch->mod->name); - klp_free_patch(patch); + klp_init_transition(patch, KLP_PATCHED); - mutex_unlock(&klp_mutex); + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the ops->func_stack writes in + * klp_patch_object(), so that klp_ftrace_handler() will see the + * func->transition updates before the handler is registered and the + * new funcs become visible to the handler. + */ + smp_wmb(); - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + klp_for_each_object(patch, obj) { + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + } + + klp_start_transition(); + patch->enabled = true; + klp_try_complete_transition(); return 0; err: - mutex_unlock(&klp_mutex); + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); return ret; } -EXPORT_SYMBOL_GPL(klp_unregister_patch); /** - * klp_register_patch() - registers a patch - * @patch: Patch to be registered + * klp_enable_patch() - enable the livepatch + * @patch: patch to be enabled * - * Initializes the data structure associated with the patch and - * creates the sysfs interface. 
+ * Initializes the data structure associated with the patch, creates the sysfs + * interface, performs the needed symbol lookups and code relocations, + * registers the patched functions with ftrace. * - * There is no need to take the reference on the patch module here. It is done - * later when the patch is enabled. + * This function is supposed to be called from the livepatch module_init() + * callback. * * Return: 0 on success, otherwise error */ -int klp_register_patch(struct klp_patch *patch) +int klp_enable_patch(struct klp_patch *patch) { + int ret; + if (!patch || !patch->mod) return -EINVAL; @@ -896,13 +992,91 @@ int klp_register_patch(struct klp_patch *patch) return -ENODEV; if (!klp_have_reliable_stack()) { - pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -ENOSYS; + pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); + pr_warn("The livepatch transition may never complete.\n"); + } + + mutex_lock(&klp_mutex); + + ret = klp_init_patch_early(patch); + if (ret) { + mutex_unlock(&klp_mutex); + return ret; } - return klp_init_patch(patch); + ret = klp_init_patch(patch); + if (ret) + goto err; + + ret = __klp_enable_patch(patch); + if (ret) + goto err; + + mutex_unlock(&klp_mutex); + + return 0; + +err: + klp_free_patch_start(patch); + + mutex_unlock(&klp_mutex); + + klp_free_patch_finish(patch); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * This function removes replaced patches. + * + * We could be pretty aggressive here. It is called in the situation where + * these structures are no longer accessible. All functions are redirected + * by the klp_transition_patch. They use either a new code or they are in + * the original code because of the special nop function patches. + * + * The only exception is when the transition was forced. In this case, + * klp_ftrace_handler() might still see the replaced patch on the stack. 
+ * Fortunately, it is carefully designed to work with removed functions + * thanks to RCU. We only have to keep the patches on the system. Also + * this is handled transparently by patch->module_put. + */ +void klp_discard_replaced_patches(struct klp_patch *new_patch) +{ + struct klp_patch *old_patch, *tmp_patch; + + klp_for_each_patch_safe(old_patch, tmp_patch) { + if (old_patch == new_patch) + return; + + old_patch->enabled = false; + klp_unpatch_objects(old_patch); + klp_free_patch_start(old_patch); + schedule_work(&old_patch->free_work); + } +} + +/* + * This function removes the dynamically allocated 'nop' functions. + * + * We could be pretty aggressive. NOPs do not change the existing + * behavior except for adding unnecessary delay by the ftrace handler. + * + * It is safe even when the transition was forced. The ftrace handler + * will see a valid ops->func_stack entry thanks to RCU. + * + * We could even free the NOPs structures. They must be the last entry + * in ops->func_stack. Therefore unregister_ftrace_function() is called. + * It does the same as klp_synchronize_transition() to make sure that + * nobody is inside the ftrace handler once the operation finishes. + * + * IMPORTANT: It must be called right after removing the replaced patches! + */ +void klp_discard_nops(struct klp_patch *new_patch) +{ + klp_unpatch_objects_dynamic(klp_transition_patch); + klp_free_objects_dynamic(klp_transition_patch); } -EXPORT_SYMBOL_GPL(klp_register_patch); /* * Remove parts of patches that touch a given kernel module. 
The list of @@ -915,7 +1089,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod, struct klp_patch *patch; struct klp_object *obj; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { if (patch == limit) break; @@ -923,21 +1097,14 @@ static void klp_cleanup_module_patches_limited(struct module *mod, if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - /* - * Only unpatch the module if the patch is enabled or - * is in transition. - */ - if (patch->enabled || patch == klp_transition_patch) { + if (patch != klp_transition_patch) + klp_pre_unpatch_callback(obj); - if (patch != klp_transition_patch) - klp_pre_unpatch_callback(obj); + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_unpatch_object(obj); - pr_notice("reverting patch '%s' on unloading module '%s'\n", - patch->mod->name, obj->mod->name); - klp_unpatch_object(obj); - - klp_post_unpatch_callback(obj); - } + klp_post_unpatch_callback(obj); klp_free_object_loaded(obj); break; @@ -962,7 +1129,7 @@ int klp_module_coming(struct module *mod) */ mod->klp_alive = true; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; @@ -976,13 +1143,6 @@ int klp_module_coming(struct module *mod) goto err; } - /* - * Only patch the module if the patch is enabled or is - * in transition. 
- */ - if (!patch->enabled && patch != klp_transition_patch) - break; - pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index 48a83d4364cf..ec43a40b853f 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -5,6 +5,17 @@ #include <linux/livepatch.h> extern struct mutex klp_mutex; +extern struct list_head klp_patches; + +#define klp_for_each_patch_safe(patch, tmp_patch) \ + list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list) + +#define klp_for_each_patch(patch) \ + list_for_each_entry(patch, &klp_patches, list) + +void klp_free_patch_start(struct klp_patch *patch); +void klp_discard_replaced_patches(struct klp_patch *new_patch); +void klp_discard_nops(struct klp_patch *new_patch); static inline bool klp_is_object_loaded(struct klp_object *obj) { diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 7702cb4064fc..99cb3ad05eb4 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -34,7 +34,7 @@ static LIST_HEAD(klp_ops); -struct klp_ops *klp_find_ops(unsigned long old_addr) +struct klp_ops *klp_find_ops(void *old_func) { struct klp_ops *ops; struct klp_func *func; @@ -42,7 +42,7 @@ struct klp_ops *klp_find_ops(unsigned long old_addr) list_for_each_entry(ops, &klp_ops, node) { func = list_first_entry(&ops->func_stack, struct klp_func, stack_node); - if (func->old_addr == old_addr) + if (func->old_func == old_func) return ops; } @@ -118,7 +118,15 @@ static void notrace klp_ftrace_handler(unsigned long ip, } } + /* + * NOPs are used to replace existing patches with original code. + * Do nothing! Setting pc would cause an infinite loop. 
+ */ + if (func->nop) + goto unlock; + klp_arch_set_pc(regs, (unsigned long)func->new_func); + unlock: preempt_enable_notrace(); } @@ -142,17 +150,18 @@ static void klp_unpatch_func(struct klp_func *func) if (WARN_ON(!func->patched)) return; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (WARN_ON(!ops)) return; if (list_is_singular(&ops->func_stack)) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (WARN_ON(!ftrace_loc)) return; @@ -174,17 +183,18 @@ static int klp_patch_func(struct klp_func *func) struct klp_ops *ops; int ret; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return -EINVAL; if (WARN_ON(func->patched)) return -EINVAL; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (!ops) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (!ftrace_loc) { pr_err("failed to find location for function '%s'\n", func->old_name); @@ -236,15 +246,26 @@ err: return ret; } -void klp_unpatch_object(struct klp_object *obj) +static void __klp_unpatch_object(struct klp_object *obj, bool nops_only) { struct klp_func *func; - klp_for_each_func(obj, func) + klp_for_each_func(obj, func) { + if (nops_only && !func->nop) + continue; + if (func->patched) klp_unpatch_func(func); + } - obj->patched = false; + if (obj->dynamic || !nops_only) + obj->patched = false; +} + + +void klp_unpatch_object(struct klp_object *obj) +{ + __klp_unpatch_object(obj, false); } int klp_patch_object(struct klp_object *obj) @@ -267,11 +288,21 @@ int klp_patch_object(struct klp_object *obj) return 0; } -void klp_unpatch_objects(struct klp_patch *patch) +static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only) { struct 
klp_object *obj; klp_for_each_object(patch, obj) if (obj->patched) - klp_unpatch_object(obj); + __klp_unpatch_object(obj, nops_only); +} + +void klp_unpatch_objects(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, false); +} + +void klp_unpatch_objects_dynamic(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, true); } diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h index e72d8250d04b..d5f2fbe373e0 100644 --- a/kernel/livepatch/patch.h +++ b/kernel/livepatch/patch.h @@ -10,7 +10,7 @@ * struct klp_ops - structure for tracking registered ftrace ops structs * * A single ftrace_ops is shared between all enabled replacement functions - * (klp_func structs) which have the same old_addr. This allows the switch + * (klp_func structs) which have the same old_func. This allows the switch * between function versions to happen instantaneously by updating the klp_ops * struct's func_stack list. The winner is the klp_func at the top of the * func_stack (front of the list). 
@@ -25,10 +25,11 @@ struct klp_ops { struct ftrace_ops fops; }; -struct klp_ops *klp_find_ops(unsigned long old_addr); +struct klp_ops *klp_find_ops(void *old_func); int klp_patch_object(struct klp_object *obj); void klp_unpatch_object(struct klp_object *obj); void klp_unpatch_objects(struct klp_patch *patch); +void klp_unpatch_objects_dynamic(struct klp_patch *patch); #endif /* _LIVEPATCH_PATCH_H */ diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 304d5eb8a98c..c53370d596be 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -29,11 +29,13 @@ #define MAX_STACK_ENTRIES 100 #define STACK_ERR_BUF_SIZE 128 +#define SIGNALS_TIMEOUT 15 + struct klp_patch *klp_transition_patch; static int klp_target_state = KLP_UNDEFINED; -static bool klp_forced = false; +static unsigned int klp_signals_cnt; /* * This work can be performed periodically to finish patching or unpatching any @@ -87,6 +89,11 @@ static void klp_complete_transition(void) klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { + klp_discard_replaced_patches(klp_transition_patch); + klp_discard_nops(klp_transition_patch); + } + if (klp_target_state == KLP_UNPATCHED) { /* * All tasks have transitioned to KLP_UNPATCHED so we can now @@ -136,13 +143,6 @@ static void klp_complete_transition(void) pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * klp_forced set implies unbounded increase of module's ref count if - * the module is disabled/enabled in a loop. 
- */ - if (!klp_forced && klp_target_state == KLP_UNPATCHED) - module_put(klp_transition_patch->mod); - klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; } @@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task) * Determine whether the given stack trace includes any references to a * to-be-patched or to-be-unpatched function. */ -static int klp_check_stack_func(struct klp_func *func, - struct stack_trace *trace) +static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, + unsigned int nr_entries) { unsigned long func_addr, func_size, address; struct klp_ops *ops; int i; - for (i = 0; i < trace->nr_entries; i++) { - address = trace->entries[i]; + for (i = 0; i < nr_entries; i++) { + address = entries[i]; if (klp_target_state == KLP_UNPATCHED) { /* @@ -224,11 +224,11 @@ static int klp_check_stack_func(struct klp_func *func, * Check for the to-be-patched function * (the previous func). */ - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (list_is_singular(&ops->func_stack)) { /* original function */ - func_addr = func->old_addr; + func_addr = (unsigned long)func->old_func; func_size = func->old_size; } else { /* previously patched function */ @@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func, static int klp_check_stack(struct task_struct *task, char *err_buf) { static unsigned long entries[MAX_STACK_ENTRIES]; - struct stack_trace trace; struct klp_object *obj; struct klp_func *func; - int ret; + int ret, nr_entries; - trace.skip = 0; - trace.nr_entries = 0; - trace.max_entries = MAX_STACK_ENTRIES; - trace.entries = entries; - ret = save_stack_trace_tsk_reliable(task, &trace); + ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); WARN_ON_ONCE(ret == -ENOSYS); - if (ret) { + if (ret < 0) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d has an unreliable stack\n", __func__, task->comm, task->pid); return ret; } + nr_entries = ret; 
klp_for_each_object(klp_transition_patch, obj) { if (!obj->patched) continue; klp_for_each_func(obj, func) { - ret = klp_check_stack_func(func, &trace); + ret = klp_check_stack_func(func, entries, nr_entries); if (ret) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d is sleeping on function %s\n", @@ -348,6 +344,47 @@ done: } /* + * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. + * Kthreads with TIF_PATCH_PENDING set are woken up. + */ +static void klp_send_signals(void) +{ + struct task_struct *g, *task; + + if (klp_signals_cnt == SIGNALS_TIMEOUT) + pr_notice("signaling remaining tasks\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + if (!klp_patch_pending(task)) + continue; + + /* + * There is a small race here. We could see TIF_PATCH_PENDING + * set and decide to wake up a kthread or send a fake signal. + * Meanwhile the task could migrate itself and the action + * would be meaningless. It is not serious though. + */ + if (task->flags & PF_KTHREAD) { + /* + * Wake up a kthread which sleeps interruptedly and + * still has not been migrated. + */ + wake_up_state(task, TASK_INTERRUPTIBLE); + } else { + /* + * Send fake signal to all non-kthread tasks which are + * still not migrated. + */ + spin_lock_irq(&task->sighand->siglock); + signal_wake_up(task, 0); + spin_unlock_irq(&task->sighand->siglock); + } + } + read_unlock(&tasklist_lock); +} + +/* * Try to switch all remaining tasks to the target patch state by walking the * stacks of sleeping tasks and looking for any to-be-patched or * to-be-unpatched functions. 
If such functions are found, the task can't be @@ -359,6 +396,7 @@ void klp_try_complete_transition(void) { unsigned int cpu; struct task_struct *g, *task; + struct klp_patch *patch; bool complete = true; WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); @@ -396,6 +434,10 @@ void klp_try_complete_transition(void) put_online_cpus(); if (!complete) { + if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT)) + klp_send_signals(); + klp_signals_cnt++; + /* * Some tasks weren't able to be switched over. Try again * later and/or wait for other methods like kernel exit @@ -407,7 +449,18 @@ void klp_try_complete_transition(void) } /* we're done, now cleanup the data structures */ + patch = klp_transition_patch; klp_complete_transition(); + + /* + * It would make more sense to free the patch in + * klp_complete_transition() but it is called also + * from klp_cancel_transition(). + */ + if (!patch->enabled) { + klp_free_patch_start(patch); + schedule_work(&patch->free_work); + } } /* @@ -446,6 +499,8 @@ void klp_start_transition(void) if (task->patch_state != klp_target_state) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } + + klp_signals_cnt = 0; } /* @@ -569,47 +624,6 @@ void klp_copy_process(struct task_struct *child) } /* - * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. - * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this - * action currently. - */ -void klp_send_signals(void) -{ - struct task_struct *g, *task; - - pr_notice("signaling remaining tasks\n"); - - read_lock(&tasklist_lock); - for_each_process_thread(g, task) { - if (!klp_patch_pending(task)) - continue; - - /* - * There is a small race here. We could see TIF_PATCH_PENDING - * set and decide to wake up a kthread or send a fake signal. - * Meanwhile the task could migrate itself and the action - * would be meaningless. It is not serious though. 
- */ - if (task->flags & PF_KTHREAD) { - /* - * Wake up a kthread which sleeps interruptedly and - * still has not been migrated. - */ - wake_up_state(task, TASK_INTERRUPTIBLE); - } else { - /* - * Send fake signal to all non-kthread tasks which are - * still not migrated. - */ - spin_lock_irq(&task->sighand->siglock); - signal_wake_up(task, 0); - spin_unlock_irq(&task->sighand->siglock); - } - } - read_unlock(&tasklist_lock); -} - -/* * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an * existing transition to finish. * @@ -620,6 +634,7 @@ void klp_send_signals(void) */ void klp_force_transition(void) { + struct klp_patch *patch; struct task_struct *g, *task; unsigned int cpu; @@ -633,5 +648,6 @@ void klp_force_transition(void) for_each_possible_cpu(cpu) klp_update_patch_state(idle_task(cpu)); - klp_forced = true; + klp_for_each_patch(patch) + patch->forced = true; } diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index f9d0bc016067..322db16233de 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -11,7 +11,6 @@ void klp_cancel_transition(void); void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); -void klp_send_signals(void); void klp_force_transition(void); #endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs. 
KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..fa2c2f951c6b --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * Collect locking event counts + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/fs.h> + +#include "lock_events.h" + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR "lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the <debugfs>/lock_event_counts/ + * directory. 
See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. + */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, id, len; + u64 sum = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + id = (long)file_inode(file)->i_private; + + if (id >= lockevent_num) + return -EBADF; + + for_each_possible_cpu(cpu) + sum += per_cpu(lockevents[id], cpu); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. 
+ */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + + for (i = 0 ; i < lockevent_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include <asm/paravirt.h> + +static bool __init skip_lockevent(const char *name) +{ + static int pv_on __initdata = -1; + + if (pv_on < 0) + pv_on = !pv_is_native_spin_unlock(); + /* + * Skip PV qspinlock events on bare metal. + */ + if (!pv_on && !memcmp(name, "pv_", 3)) + return true; + return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ + return false; +} +#endif + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ + struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); + int i; + + if (!d_counts) + goto out; + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. 
+ */ + for (i = 0; i < lockevent_num; i++) { + if (skip_lockevent(lockevent_names[i])) + continue; + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) + goto fail_undo; + } + + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) + goto fail_undo; + + return 0; +fail_undo: + debugfs_remove_recursive(d_counts); +out: + pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); + return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..feb1acc54611 --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + +enum lock_events { + +#include "lock_events_list.h" + + lockevent_num, /* Total number of lock event counts */ + LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_LOCK_EVENT_COUNTS +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ + if (cond) + __this_cpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ + __this_cpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) + +#else /* CONFIG_LOCK_EVENT_COUNTS */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) +#define lockevent_cond_inc(ev, c) + +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..ad7668cfc9da --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name) LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. + */ +LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */ +LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */ +LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */ +LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */ +LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */ +LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */ +LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */ +LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */ +LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. 
+ */ +LOCK_EVENT(lock_pending) /* # of locking ops via pending code */ +LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */ +LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ +LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ +LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ +LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ +LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ +LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ +LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ +LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 95932333a48b..d06190fa5082 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -45,11 +45,14 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> +#include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/nmi.h> +#include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <asm/sections.h> @@ -81,6 +84,7 @@ module_param(lock_stat, int, 0644); * code to recurse back into the lockdep code... 
*/ static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { @@ -130,13 +134,17 @@ static inline int debug_locks_off_graph_unlock(void) unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); /* * All data structures here are protected by the global debug_lock. * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. + * nr_lock_classes is the number of elements of lock_classes[] that is + * in use. */ +#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define KEYHASH_SIZE (1UL << KEYHASH_BITS) +static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; #ifndef CONFIG_DEBUG_LOCKDEP static @@ -277,11 +285,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock) #endif /* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. + * We keep a global list of all lock classes. The list is only accessed with + * the lockdep spinlock lock held. free_lock_classes is a list with free + * elements. These elements are linked together by the lock_entry member in + * struct lock_class. */ LIST_HEAD(all_lock_classes); +static LIST_HEAD(free_lock_classes); + +/** + * struct pending_free - information about data structures about to be freed + * @zapped: Head of a list with struct lock_class elements. + * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements + * are about to be freed. 
+ */ +struct pending_free { + struct list_head zapped; + DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS); +}; + +/** + * struct delayed_free - data structures used for delayed freeing + * + * A data structure for delayed freeing of data structures that may be + * accessed by RCU readers at the time these were freed. + * + * @rcu_head: Used to schedule an RCU callback for freeing data structures. + * @index: Index of @pf to which freed data structures are added. + * @scheduled: Whether or not an RCU callback has been scheduled. + * @pf: Array with information about data structures about to be freed. + */ +static struct delayed_free { + struct rcu_head rcu_head; + int index; + int scheduled; + struct pending_free pf[2]; +} delayed_free; /* * The lockdep classes are in a hash-table as well, for fast lookup: @@ -331,6 +370,11 @@ void lockdep_on(void) } EXPORT_SYMBOL(lockdep_on); +void lockdep_set_selftest_task(struct task_struct *task) +{ + lockdep_selftest_task_struct = task; +} + /* * Debugging switches: */ @@ -390,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg) #endif } -static int save_trace(struct stack_trace *trace) +static int save_trace(struct lock_trace *trace) { - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); - - /* - * Some daft arches put -1 at the end to indicate its a full trace. 
- * - * <rant> this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless </rant> - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; - - trace->max_entries = trace->nr_entries; + unsigned long *entries = stack_trace + nr_stack_trace_entries; + unsigned int max_entries; + trace->offset = nr_stack_trace_entries; + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->nr_entries = stack_trace_save(entries, max_entries, 3); nr_stack_trace_entries += trace->nr_entries; if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { @@ -472,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { char c = '.'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '+'; if (class->usage_mask & lock_flag(bit)) { c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) c = '?'; } @@ -599,12 +628,15 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ -static int static_obj(void *obj) +static int static_obj(const void *obj) { unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, addr = (unsigned long) obj; + if (arch_is_kernel_initmem_freed(addr)) + return 0; + /* * static variable? */ @@ -716,6 +748,17 @@ static bool assign_lock_key(struct lockdep_map *lock) { unsigned long can_addr, addr = (unsigned long)lock; +#ifdef __KERNEL__ + /* + * lockdep_free_key_range() assumes that struct lock_class_key + * objects do not overlap. Since we use the address of lock + * objects as class key for static objects, check whether the + * size of lock_class_key objects does not exceed the size of + * the smallest lock object. 
+ */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t)); +#endif + if (__is_kernel_percpu_address(addr, &can_addr)) lock->key = (void *)can_addr; else if (__is_module_percpu_address(addr, &can_addr)) @@ -735,6 +778,289 @@ static bool assign_lock_key(struct lockdep_map *lock) return true; } +#ifdef CONFIG_DEBUG_LOCKDEP + +/* Check whether element @e occurs in list @h */ +static bool in_list(struct list_head *e, struct list_head *h) +{ + struct list_head *f; + + list_for_each(f, h) { + if (e == f) + return true; + } + + return false; +} + +/* + * Check whether entry @e occurs in any of the locks_after or locks_before + * lists. + */ +static bool in_any_class_list(struct list_head *e) +{ + struct lock_class *class; + int i; + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (in_list(e, &class->locks_after) || + in_list(e, &class->locks_before)) + return true; + } + return false; +} + +static bool class_lock_list_valid(struct lock_class *c, struct list_head *h) +{ + struct lock_list *e; + + list_for_each_entry(e, h, entry) { + if (e->links_to != c) { + printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s", + c->name ? : "(?)", + (unsigned long)(e - list_entries), + e->links_to && e->links_to->name ? + e->links_to->name : "(?)", + e->class && e->class->name ? e->class->name : + "(?)"); + return false; + } + } + return true; +} + +#ifdef CONFIG_PROVE_LOCKING +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +#endif + +static bool check_lock_chain_key(struct lock_chain *chain) +{ +#ifdef CONFIG_PROVE_LOCKING + u64 chain_key = 0; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + /* + * The 'unsigned long long' casts avoid that a compiler warning + * is reported when building tools/lib/lockdep. 
+ */ + if (chain->chain_key != chain_key) { + printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n", + (unsigned long long)(chain - lock_chains), + (unsigned long long)chain->chain_key, + (unsigned long long)chain_key); + return false; + } +#endif + return true; +} + +static bool in_any_zapped_class_list(struct lock_class *class) +{ + struct pending_free *pf; + int i; + + for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) { + if (in_list(&class->lock_entry, &pf->zapped)) + return true; + } + + return false; +} + +static bool __check_data_structures(void) +{ + struct lock_class *class; + struct lock_chain *chain; + struct hlist_head *head; + struct lock_list *e; + int i; + + /* Check whether all classes occur in a lock list. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!in_list(&class->lock_entry, &all_lock_classes) && + !in_list(&class->lock_entry, &free_lock_classes) && + !in_any_zapped_class_list(class)) { + printk(KERN_INFO "class %px/%s is not in any class list\n", + class, class->name ? : "(?)"); + return false; + } + } + + /* Check whether all classes have valid lock lists. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!class_lock_list_valid(class, &class->locks_before)) + return false; + if (!class_lock_list_valid(class, &class->locks_after)) + return false; + } + + /* Check the chain_key of all lock chains. */ + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + if (!check_lock_chain_key(chain)) + return false; + } + } + + /* + * Check whether all list entries that are in use occur in a class + * lock list. 
+ */ + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (!in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class->name ? : "(?)", + e->links_to->name ? : "(?)"); + return false; + } + } + + /* + * Check whether all list entries that are not in use do not occur in + * a class lock list. + */ + for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class && e->class->name ? e->class->name : + "(?)", + e->links_to && e->links_to->name ? + e->links_to->name : "(?)"); + return false; + } + } + + return true; +} + +int check_consistency = 0; +module_param(check_consistency, int, 0644); + +static void check_data_structures(void) +{ + static bool once = false; + + if (check_consistency && !once) { + if (!__check_data_structures()) { + once = true; + WARN_ON(once); + } + } +} + +#else /* CONFIG_DEBUG_LOCKDEP */ + +static inline void check_data_structures(void) { } + +#endif /* CONFIG_DEBUG_LOCKDEP */ + +/* + * Initialize the lock_classes[] array elements, the free_lock_classes list + * and also the delayed_free structure. 
+ */ +static void init_data_structures_once(void) +{ + static bool ds_initialized, rcu_head_initialized; + int i; + + if (likely(rcu_head_initialized)) + return; + + if (system_state >= SYSTEM_SCHEDULING) { + init_rcu_head(&delayed_free.rcu_head); + rcu_head_initialized = true; + } + + if (ds_initialized) + return; + + ds_initialized = true; + + INIT_LIST_HEAD(&delayed_free.pf[0].zapped); + INIT_LIST_HEAD(&delayed_free.pf[1].zapped); + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes); + INIT_LIST_HEAD(&lock_classes[i].locks_after); + INIT_LIST_HEAD(&lock_classes[i].locks_before); + } +} + +static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) +{ + unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS); + + return lock_keys_hash + hash; +} + +/* Register a dynamically allocated key. */ +void lockdep_register_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + unsigned long flags; + + if (WARN_ON_ONCE(static_obj(key))) + return; + hash_head = keyhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) + goto restore_irqs; + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (WARN_ON_ONCE(k == key)) + goto out_unlock; + } + hlist_add_head_rcu(&key->hash_entry, hash_head); +out_unlock: + graph_unlock(); +restore_irqs: + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_register_key); + +/* Check whether a key has been registered as a dynamic key. */ +static bool is_dynamic_key(const struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + bool found = false; + + if (WARN_ON_ONCE(static_obj(key))) + return false; + + /* + * If lock debugging is disabled lock_keys_hash[] may contain + * pointers to memory that has already been freed. Avoid triggering + * a use-after-free in that case by returning early. 
+ */ + if (!debug_locks) + return true; + + hash_head = keyhashentry(key); + + rcu_read_lock(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; +} + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -756,7 +1082,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (!lock->key) { if (!assign_lock_key(lock)) return NULL; - } else if (!static_obj(lock->key)) { + } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) { return NULL; } @@ -775,11 +1101,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) goto out_unlock_set; } - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + init_data_structures_once(); + + /* Allocate a new lock class and add it to the hash. 
*/ + class = list_first_entry_or_null(&free_lock_classes, typeof(*class), + lock_entry); + if (!class) { if (!debug_locks_off_graph_unlock()) { return NULL; } @@ -788,13 +1115,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) dump_stack(); return NULL; } - class = lock_classes + nr_lock_classes++; + nr_lock_classes++; debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); /* * We use RCU's safe list-add method to make @@ -802,9 +1129,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) */ hlist_add_head_rcu(&class->hash_entry, hash_head); /* - * Add it to the global list of classes: + * Remove the class from the free list and add it to the global list + * of classes. 
*/ - list_add_tail(&class->lock_entry, &all_lock_classes); + list_move_tail(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -845,7 +1173,10 @@ out_set_class_cache: */ static struct lock_list *alloc_list_entry(void) { - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + int idx = find_first_zero_bit(list_entries_in_use, + ARRAY_SIZE(list_entries)); + + if (idx >= ARRAY_SIZE(list_entries)) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -853,15 +1184,18 @@ static struct lock_list *alloc_list_entry(void) dump_stack(); return NULL; } - return list_entries + nr_list_entries++; + nr_list_entries++; + __set_bit(idx, list_entries_in_use); + return list_entries + idx; } /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *this, struct list_head *head, +static int add_lock_to_list(struct lock_class *this, + struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct stack_trace *trace) + struct lock_trace *trace) { struct lock_list *entry; /* @@ -873,6 +1207,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head, return 0; entry->class = this; + entry->links_to = links_to; entry->distance = distance; entry->trace = *trace; /* @@ -955,7 +1290,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } @@ -965,7 +1300,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1079,6 +1414,13 @@ static inline 
int __bfs_backwards(struct lock_list *src_entry, * checking. */ +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ + unsigned long *entries = stack_trace + trace->offset; + + stack_trace_print(entries, trace->nr_entries, spaces); +} + /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): @@ -1091,8 +1433,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_stack_trace(&target->trace, 6); - + print_lock_trace(&target->trace, 6); return 0; } @@ -1186,10 +1527,9 @@ static inline int class_equal(struct lock_list *entry, void *data) } static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt, - struct stack_trace *trace) + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1329,19 +1669,25 @@ check_redundant(struct lock_list *root, struct lock_class *target, } #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +static inline int usage_accumulate(struct lock_list *entry, void *mask) +{ + *(unsigned long *)mask |= entry->class->usage_mask; + + return 0; +} + /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static inline int usage_match(struct lock_list *entry, void *bit) +static inline int usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); + return entry->class->usage_mask & *(unsigned long *)mask; } - - /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. 
@@ -1353,14 +1699,14 @@ static inline int usage_match(struct lock_list *entry, void *bit) * Return <0 on error. */ static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1376,14 +1722,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, * Return <0 on error. */ static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1405,7 +1751,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_stack_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); @@ -1430,7 +1776,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); + print_lock_trace(&entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1543,14 +1889,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... 
which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -1575,39 +1921,6 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; } -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); -} - static const char *state_names[] = { #define LOCKDEP_STATE(__STATE) \ __stringify(__STATE), @@ -1624,70 +1937,184 @@ static const char *state_rnames[] = { static inline const char *state_name(enum lock_usage_bit bit) { - return (bit & 1) ? 
state_rnames[bit >> 2] : state_names[bit >> 2]; + if (bit & LOCK_USAGE_READ_MASK) + return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; + else + return state_names[bit >> LOCK_USAGE_DIR_MASK]; } +/* + * The bit number is encoded like: + * + * bit0: 0 exclusive, 1 read lock + * bit1: 0 used in irq, 1 irq enabled + * bit2-n: state + */ static int exclusive_bit(int new_bit) { - /* - * USED_IN - * USED_IN_READ - * ENABLED - * ENABLED_READ - * - * bit 0 - write/read - * bit 1 - used_in/enabled - * bit 2+ state - */ - - int state = new_bit & ~3; - int dir = new_bit & 2; + int state = new_bit & LOCK_USAGE_STATE_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * keep state, bit flip the direction and strip read. */ - return state | (dir ^ 2); + return state | (dir ^ LOCK_USAGE_DIR_MASK); +} + +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. + * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. + */ +static unsigned long invert_dir_mask(unsigned long mask) +{ + unsigned long excl = 0; + + /* Invert dir */ + excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; + excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + + return excl; } +/* + * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all + * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). + * And then mask out all bitnr0. 
+ */ +static unsigned long exclusive_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Strip read */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; + excl &= ~LOCKF_IRQ_READ; + + return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + + return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. + */ +static int find_exclusive_match(unsigned long mask, + unsigned long excl_mask, + enum lock_usage_bit *bitp, + enum lock_usage_bit *excl_bitp) +{ + int bit, excl; + + for_each_set_bit(bit, &mask, LOCK_USED) { + excl = exclusive_bit(bit); + if (excl_mask & lock_flag(excl)) { + *bitp = bit; + *excl_bitp = excl; + return 0; + } + } + return -1; +} + +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) + struct held_lock *next) { + unsigned long usage_mask = 0, forward_mask, backward_mask; + enum lock_usage_bit forward_bit = 0, backward_bit = 0; + struct lock_list *uninitialized_var(target_entry1); + struct lock_list *uninitialized_var(target_entry); + struct lock_list this, that; + int ret; + /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe 
lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 1: gather all hard/soft IRQs usages backward in an + * accumulated usage mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + this.parent = NULL; + this.class = hlock_class(prev); + + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); + if (ret < 0) + return print_bfs_bug(ret); - bit++; /* _READ */ + usage_mask &= LOCKF_USED_IN_IRQ_ALL; + if (!usage_mask) + return 1; /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 2: find exclusive uses forward that match the previous + * backward accumulated mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; + forward_mask = exclusive_mask(usage_mask); - return 1; -} + that.parent = NULL; + that.class = hlock_class(next); -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ - return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE + ret = find_usage_forwards(&that, forward_mask, &target_entry1); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; - return 1; + /* + * Step 3: we found a bad match! Now retrieve a lock from the backward + * list whose usage mask matches the exclusive usage mask from the + * lock found on the forward list. 
+ */ + backward_mask = original_mask(target_entry1->class->usage_mask); + + ret = find_usage_backwards(&this, backward_mask, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (DEBUG_LOCKS_WARN_ON(ret == 1)) + return 1; + + /* + * Step 4: narrow down to a pair of incompatible usage bits + * and report it. + */ + ret = find_exclusive_match(target_entry->class->usage_mask, + target_entry1->class->usage_mask, + &backward_bit, &forward_bit); + if (DEBUG_LOCKS_WARN_ON(ret == -1)) + return 1; + + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); } static void inc_chains(void) @@ -1704,9 +2131,8 @@ static void inc_chains(void) #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static inline int check_irq_usage(struct task_struct *curr, + struct held_lock *prev, struct held_lock *next) { return 1; } @@ -1834,14 +2260,31 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct stack_trace *trace, - int (*save)(struct stack_trace *trace)) + struct held_lock *next, int distance, struct lock_trace *trace) { struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; struct lock_list this; int ret; + if (!hlock_class(prev)->key || !hlock_class(next)->key) { + /* + * The warning statements below may trigger a use-after-free + * of the class name. It is better to trigger a use-after free + * and to have the class name most of the time instead of not + * having the class name available. 
+ */ + WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(prev), + hlock_class(prev)->name); + WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(next), + hlock_class(next)->name); + return 2; + } + /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by @@ -1855,20 +2298,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, this.parent = NULL; ret = check_noncircular(&this, hlock_class(prev), &target_entry); if (unlikely(!ret)) { - if (!trace->entries) { + if (!trace->nr_entries) { /* - * If @save fails here, the printing might trigger - * a WARN but because of the !nr_entries it should - * not do bad things. + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. */ - save(trace); + save_trace(trace); } - return print_circular_bug(&this, target_entry, next, prev, trace); + return print_circular_bug(&this, target_entry, next, prev); } else if (unlikely(ret < 0)) return print_bfs_bug(ret); - if (!check_prev_add_irq(curr, prev, next)) + if (!check_irq_usage(curr, prev, next)) return 0; /* @@ -1911,21 +2354,21 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return print_bfs_bug(ret); - if (!trace->entries && !save(trace)) + if (!trace->nr_entries && !save_trace(trace)) return 0; /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(next), + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, next->acquire_ip, distance, trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, next->acquire_ip, distance, trace); if (!ret) 
@@ -1943,14 +2386,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { + struct lock_trace trace = { .nr_entries = 0 }; int depth = curr->lockdep_depth; struct held_lock *hlock; - struct stack_trace trace = { - .nr_entries = 0, - .max_entries = 0, - .entries = NULL, - .skip = 0, - }; /* * Debugging checks. @@ -1976,7 +2414,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + int ret = check_prev_add(curr, hlock, next, distance, + &trace); if (!ret) return 0; @@ -2018,8 +2457,8 @@ out_bug: return 0; } -unsigned long nr_lock_chains; struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; @@ -2153,6 +2592,33 @@ static int check_no_collision(struct task_struct *curr, } /* + * Given an index that is >= -1, return the index of the next lock chain. + * Return -2 if there is no next lock chain. + */ +long lockdep_next_lockchain(long i) +{ + i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1); + return i < ARRAY_SIZE(lock_chains) ? i : -2; +} + +unsigned long lock_chain_count(void) +{ + return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains)); +} + +/* Must be called with the graph lock held. */ +static struct lock_chain *alloc_lock_chain(void) +{ + int idx = find_first_zero_bit(lock_chains_in_use, + ARRAY_SIZE(lock_chains)); + + if (unlikely(idx >= ARRAY_SIZE(lock_chains))) + return NULL; + __set_bit(idx, lock_chains_in_use); + return lock_chains + idx; +} + +/* * Adds a dependency chain into chain hashtable. And must be called with * graph_lock held. 
* @@ -2169,19 +2635,15 @@ static inline int add_chain_cache(struct task_struct *curr, int i, j; /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - - /* - * We might need to take the graph lock, ensure we've got IRQs + * The caller must hold the graph lock, ensure we've got IRQs * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + chain = alloc_lock_chain(); + if (!chain) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2189,7 +2651,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } - chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); @@ -2206,16 +2667,8 @@ static inline int add_chain_cache(struct task_struct *curr, chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; - } - - if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) nr_chain_hlocks += chain->depth; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * Important for check_no_collision(). - */ - if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + } else { if (!debug_locks_off_graph_unlock()) return 0; @@ -2223,7 +2676,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } -#endif hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); @@ -2233,19 +2685,16 @@ static inline int add_chain_cache(struct task_struct *curr, } /* - * Look up a dependency chain. + * Look up a dependency chain. Must be called with either the graph lock or + * the RCU read lock held. 
*/ static inline struct lock_chain *lookup_chain_cache(u64 chain_key) { struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { + if (READ_ONCE(chain->chain_key) == chain_key) { debug_atomic_inc(chain_lookup_hits); return chain; } @@ -2367,6 +2816,10 @@ static inline int validate_chain(struct task_struct *curr, { return 1; } + +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +} #endif /* @@ -2420,6 +2873,12 @@ static void check_chain_key(struct task_struct *curr) #endif } +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + + static void print_usage_bug_scenario(struct held_lock *lock) { @@ -2463,7 +2922,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -2489,10 +2948,6 @@ valid_state(struct task_struct *curr, struct held_lock *this, return 1; } -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2572,7 +3027,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); + ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2596,7 +3051,7 @@ 
check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); + ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); if (ret < 0) return print_bfs_bug(ret); if (ret == 1) @@ -2651,7 +3106,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = { static inline int state_verbose(enum lock_usage_bit bit, struct lock_class *class) { - return state_verbose_f[bit >> 2](class); + return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class); } typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, @@ -2662,8 +3117,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); - int read = new_bit & 1; - int dir = new_bit & 2; + int read = new_bit & LOCK_USAGE_READ_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * mark USED_IN has to look forwards -- to ensure no dependency @@ -2687,19 +3142,19 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, * states. 
*/ if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) return 0; /* * Check for read in write conflicts */ if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + 1)) + if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) return 0; if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + 1, - state_name(new_bit + 1))) + !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, + state_name(new_bit + LOCK_USAGE_READ_MASK))) return 0; } @@ -2709,35 +3164,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 1; } -enum mark_type { -#define LOCKDEP_STATE(__STATE) __STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) +mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) { - enum lock_usage_bit usage_bit; struct held_lock *hlock; int i; for (i = 0; i < curr->lockdep_depth; i++) { + enum lock_usage_bit hlock_bit = base_bit; hlock = curr->held_locks + i; - usage_bit = 2 + (mark << 2); /* ENABLED */ if (hlock->read) - usage_bit += 1; /* READ */ + hlock_bit += LOCK_USAGE_READ_MASK; - BUG_ON(usage_bit >= LOCK_USAGE_STATES); + BUG_ON(hlock_bit >= LOCK_USAGE_STATES); if (!hlock->check) continue; - if (!mark_lock(curr, hlock, usage_bit)) + if (!mark_lock(curr, hlock, hlock_bit)) return 0; } @@ -2758,7 +3206,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, HARDIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2766,7 +3214,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if 
(!mark_held_locks(curr, SOFTIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) return; curr->hardirq_enable_ip = ip; @@ -2800,7 +3248,7 @@ void lockdep_hardirqs_on(unsigned long ip) /* * See the fine text that goes along with this variable definition. */ - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled)) return; /* @@ -2814,6 +3262,7 @@ void lockdep_hardirqs_on(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } +NOKPROBE_SYMBOL(lockdep_hardirqs_on); /* * Hardirqs were disabled: @@ -2843,6 +3292,7 @@ void lockdep_hardirqs_off(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } +NOKPROBE_SYMBOL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -2880,7 +3330,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, SOFTIRQ); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); current->lockdep_recursion = 0; } @@ -3119,13 +3569,12 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!key)) return; /* - * Sanity check, the lock-class key must be persistent: + * Sanity check, the lock-class key must either have been allocated + * statically or must have been registered as a dynamic key. */ - if (!static_obj(key)) { - printk("BUG: key %px not in .data!\n", key); - /* - * What it says above ^^^^^, I suggest you read it. 
- */ + if (!static_obj(key) && !is_dynamic_key(key)) { + if (debug_locks) + printk(KERN_ERR "BUG: key %px has not been registered!\n", key); DEBUG_LOCKS_WARN_ON(1); return; } @@ -3335,6 +3784,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (nest_lock && !__lock_is_held(nest_lock, -1)) return print_lock_nested_lock_not_held(curr, hlock, ip); + if (!debug_locks_silent) { + WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); + WARN_ON_ONCE(!hlock_class(hlock)->key); + } + if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; @@ -3497,6 +3951,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3535,6 +3992,9 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3650,7 +4110,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 0; } -static int __lock_is_held(const struct lockdep_map *lock, int read) +static nokprobe_inline +int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3883,6 +4344,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) return ret; } EXPORT_SYMBOL_GPL(lock_is_held_type); +NOKPROBE_SYMBOL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { @@ -4123,29 +4585,131 @@ void lockdep_reset(void) raw_local_irq_restore(flags); } +/* Remove a class from a lock chain. Must be called with the graph lock held. 
*/ +static void remove_class_from_lock_chain(struct pending_free *pf, + struct lock_chain *chain, + struct lock_class *class) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_chain *new_chain; + u64 chain_key; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) { + if (chain_hlocks[i] != class - lock_classes) + continue; + /* The code below leaks one chain_hlock[] entry. */ + if (--chain->depth > 0) { + memmove(&chain_hlocks[i], &chain_hlocks[i + 1], + (chain->base + chain->depth - i) * + sizeof(chain_hlocks[0])); + } + /* + * Each lock class occurs at most once in a lock chain so once + * we found a match we can break out of this loop. + */ + goto recalc; + } + /* Since the chain has not been modified, return. */ + return; + +recalc: + chain_key = 0; + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + if (chain->depth && chain->chain_key == chain_key) + return; + /* Overwrite the chain key for concurrent RCU readers. */ + WRITE_ONCE(chain->chain_key, chain_key); + /* + * Note: calling hlist_del_rcu() from inside a + * hlist_for_each_entry_rcu() loop is safe. + */ + hlist_del_rcu(&chain->entry); + __set_bit(chain - lock_chains, pf->lock_chains_being_freed); + if (chain->depth == 0) + return; + /* + * If the modified lock chain matches an existing lock chain, drop + * the modified lock chain. + */ + if (lookup_chain_cache(chain_key)) + return; + new_chain = alloc_lock_chain(); + if (WARN_ON_ONCE(!new_chain)) { + debug_locks_off(); + return; + } + *new_chain = *chain; + hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); +#endif +} + +/* Must be called with the graph lock held. 
*/ +static void remove_class_from_lock_chains(struct pending_free *pf, + struct lock_class *class) +{ + struct lock_chain *chain; + struct hlist_head *head; + int i; + + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + remove_class_from_lock_chain(pf, chain, class); + } + } +} + /* * Remove all references to a lock class. The caller must hold the graph lock. */ -static void zap_class(struct lock_class *class) +static void zap_class(struct pending_free *pf, struct lock_class *class) { + struct lock_list *entry; int i; + WARN_ON_ONCE(!class->key); + /* * Remove all dependencies this lock is * involved in: */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + entry = list_entries + i; + if (entry->class != class && entry->links_to != class) + continue; + __clear_bit(i, list_entries_in_use); + nr_list_entries--; + list_del_rcu(&entry->entry); } - /* - * Unhash the class and remove it from the all_lock_classes list: - */ - hlist_del_rcu(&class->hash_entry); - list_del(&class->lock_entry); + if (list_empty(&class->locks_after) && + list_empty(&class->locks_before)) { + list_move_tail(&class->lock_entry, &pf->zapped); + hlist_del_rcu(&class->hash_entry); + WRITE_ONCE(class->key, NULL); + WRITE_ONCE(class->name, NULL); + nr_lock_classes--; + } else { + WARN_ONCE(true, "%s() failed for class %s\n", __func__, + class->name); + } + + remove_class_from_lock_chains(pf, class); +} - RCU_INIT_POINTER(class->key, NULL); - RCU_INIT_POINTER(class->name, NULL); +static void reinit_class(struct lock_class *class) +{ + void *const p = class; + const unsigned int offset = offsetof(struct lock_class, key); + + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + memset(p + 
offset, 0, sizeof(*class) - offset); + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); } static inline int within(const void *addr, void *start, unsigned long size) @@ -4153,55 +4717,171 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +static bool inside_selftest(void) +{ + return current == lockdep_selftest_task_struct; +} + +/* The caller must hold the graph lock. */ +static struct pending_free *get_pending_free(void) +{ + return delayed_free.pf + delayed_free.index; +} + +static void free_zapped_rcu(struct rcu_head *cb); + /* - * Used in module.c to remove lock classes from memory that is going to be - * freed; and possibly re-used by other modules. - * - * We will have had one sync_sched() before getting here, so we're guaranteed - * nobody will look up these exact classes -- they're properly dead but still - * allocated. + * Schedule an RCU callback if no RCU callback is pending. Must be called with + * the graph lock held. */ -void lockdep_free_key_range(void *start, unsigned long size) +static void call_rcu_zapped(struct pending_free *pf) +{ + WARN_ON_ONCE(inside_selftest()); + + if (list_empty(&pf->zapped)) + return; + + if (delayed_free.scheduled) + return; + + delayed_free.scheduled = true; + + WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); + delayed_free.index ^= 1; + + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); +} + +/* The caller must hold the graph lock. May be called from RCU context. 
*/ +static void __free_zapped_classes(struct pending_free *pf) { struct lock_class *class; - struct hlist_head *head; + + check_data_structures(); + + list_for_each_entry(class, &pf->zapped, lock_entry) + reinit_class(class); + + list_splice_init(&pf->zapped, &free_lock_classes); + +#ifdef CONFIG_PROVE_LOCKING + bitmap_andnot(lock_chains_in_use, lock_chains_in_use, + pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains)); + bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains)); +#endif +} + +static void free_zapped_rcu(struct rcu_head *ch) +{ + struct pending_free *pf; unsigned long flags; - int i; - int locked; + + if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) + return; raw_local_irq_save(flags); - locked = graph_lock(); + arch_spin_lock(&lockdep_lock); + current->lockdep_recursion = 1; + + /* closed head */ + pf = delayed_free.pf + (delayed_free.index ^ 1); + __free_zapped_classes(pf); + delayed_free.scheduled = false; /* - * Unhash all classes that were created by this module: + * If there's anything on the open list, close and start a new callback. */ + call_rcu_zapped(delayed_free.pf + delayed_free.index); + + current->lockdep_recursion = 0; + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +/* + * Remove all lock classes from the class hash table and from the + * all_lock_classes list whose key or name is in the address range [start, + * start + size). Move these lock classes to the zapped_classes list. Must + * be called with the graph lock held. + */ +static void __lockdep_free_key_range(struct pending_free *pf, void *start, + unsigned long size) +{ + struct lock_class *class; + struct hlist_head *head; + int i; + + /* Unhash all classes that were created by a module. 
*/ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; hlist_for_each_entry_rcu(class, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); + if (!within(class->key, start, size) && + !within(class->name, start, size)) + continue; + zap_class(pf, class); } } +} - if (locked) - graph_unlock(); +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one synchronize_rcu() before getting here, so we're + * guaranteed nobody will look up these exact classes -- they're properly dead + * but still allocated. + */ +static void lockdep_free_key_range_reg(void *start, unsigned long size) +{ + struct pending_free *pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + current->lockdep_recursion = 1; + pf = get_pending_free(); + __lockdep_free_key_range(pf, start, size); + call_rcu_zapped(pf); + current->lockdep_recursion = 0; + arch_spin_unlock(&lockdep_lock); raw_local_irq_restore(flags); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. - * - * sync_sched() is sufficient because the read-side is IRQ disable. */ synchronize_rcu(); +} - /* - * XXX at this point we could return the resources to the pool; - * instead we leak them. We would need to change to bitmap allocators - * instead of the linear allocators we have now. - */ +/* + * Free all lockdep keys in the range [start, start+size). Does not sleep. + * Ignores debug_locks. Must only be used by the lockdep selftests. 
+ */ +static void lockdep_free_key_range_imm(void *start, unsigned long size) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_free_key_range(pf, start, size); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_free_key_range_imm(start, size); + else + lockdep_free_key_range_reg(start, size); } /* @@ -4226,14 +4906,12 @@ static bool lock_class_cache_is_registered(struct lockdep_map *lock) return false; } -void lockdep_reset_lock(struct lockdep_map *lock) +/* The caller must hold the graph lock. Does not sleep. */ +static void __lockdep_reset_lock(struct pending_free *pf, + struct lockdep_map *lock) { struct lock_class *class; - unsigned long flags; - int j, locked; - - raw_local_irq_save(flags); - locked = graph_lock(); + int j; /* * Remove all classes this lock might have: @@ -4244,27 +4922,104 @@ void lockdep_reset_lock(struct lockdep_map *lock) */ class = look_up_lock_class(lock, j); if (class) - zap_class(class); + zap_class(pf, class); } /* * Debug check: in the end all mapped classes should * be gone. */ - if (unlikely(lock_class_cache_is_registered(lock))) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); + if (WARN_ON_ONCE(lock_class_cache_is_registered(lock))) + debug_locks_off(); +} + +/* + * Remove all information lockdep has about a lock if debug_locks == 1. Free + * released data structures from RCU context. 
+ */ +static void lockdep_reset_lock_reg(struct lockdep_map *lock) +{ + struct pending_free *pf; + unsigned long flags; + int locked; + + raw_local_irq_save(flags); + locked = graph_lock(); + if (!locked) + goto out_irq; + + pf = get_pending_free(); + __lockdep_reset_lock(pf, lock); + call_rcu_zapped(pf); + + graph_unlock(); +out_irq: + raw_local_irq_restore(flags); +} + +/* + * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the + * lockdep selftests. + */ +static void lockdep_reset_lock_imm(struct lockdep_map *lock) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_reset_lock(pf, lock); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_reset_lock_imm(lock); + else + lockdep_reset_lock_reg(lock); +} + +/* Unregister a dynamically allocated key. */ +void lockdep_unregister_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head = keyhashentry(key); + struct lock_class_key *k; + struct pending_free *pf; + unsigned long flags; + bool found = false; + + might_sleep(); + + if (WARN_ON_ONCE(static_obj(key))) + return; + + raw_local_irq_save(flags); + if (!graph_lock()) + goto out_irq; + + pf = get_pending_free(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + hlist_del_rcu(&k->hash_entry); + found = true; + break; } - goto out_restore; } - if (locked) - graph_unlock(); - -out_restore: + WARN_ON_ONCE(!found); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + graph_unlock(); +out_irq: raw_local_irq_restore(flags); + + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. 
*/ + synchronize_rcu(); } +EXPORT_SYMBOL_GPL(lockdep_unregister_key); void __init lockdep_init(void) { @@ -4278,20 +5033,24 @@ void __init lockdep_init(void) printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE + printk(" memory used by lock dependency info: %zu kB\n", + (sizeof(lock_classes) + + sizeof(classhash_table) + + sizeof(list_entries) + + sizeof(list_entries_in_use) + + sizeof(chainhash_table) + + sizeof(delayed_free) #ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) + + sizeof(lock_cq) + + sizeof(lock_chains) + + sizeof(lock_chains_in_use) + + sizeof(chain_hlocks) #endif ) / 1024 ); - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); + printk(" per task-struct memory footprint: %zu bytes\n", + sizeof(((struct task_struct *)NULL)->held_locks)); } static void diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 88c847a41c8a..150ec3f0c5b5 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -22,6 +22,10 @@ enum lock_usage_bit { LOCK_USAGE_STATES }; +#define LOCK_USAGE_READ_MASK 1 +#define LOCK_USAGE_DIR_MASK 2 +#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) + /* * Usage-state bitmasks: */ @@ -38,13 +42,35 @@ enum { __LOCKF(USED) }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | +static const unsigned long LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" + 0; +#undef 
LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | +static const unsigned long LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | +static const unsigned long LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | +static const unsigned long LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) -#define LOCKF_ENABLED_IRQ_READ \ - (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ - (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ) /* * CONFIG_LOCKDEP_SMALL is defined for sparc. 
Sparc requires .text, @@ -96,7 +122,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; extern unsigned long nr_list_entries; -extern unsigned long nr_lock_chains; +long lockdep_next_lockchain(long i); +unsigned long lock_chain_count(void); extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 3d31f9b0059e..9c49ec645d8b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = { #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { + if (*pos < 0) + return NULL; + if (*pos == 0) return SEQ_START_TOKEN; - if (*pos - 1 < nr_lock_chains) - return lock_chains + (*pos - 1); - - return NULL; + return lock_chains + (*pos - 1); } static void *lc_next(struct seq_file *m, void *v, loff_t *pos) { - (*pos)++; + *pos = lockdep_next_lockchain(*pos - 1) + 1; return lc_start(m, pos); } @@ -268,7 +268,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); + lock_chain_count(), MAX_LOCKDEP_CHAINS); seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); #endif diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 7d0b0ed74404..80a463d31a8d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Module-based torture test facility for locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ @@ -45,7 +32,7 @@ #include <linux/torture.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); @@ -842,7 +829,9 @@ static void lock_torture_cleanup(void) "End of test: SUCCESS"); kfree(cxt.lwsa); + cxt.lwsa = NULL; kfree(cxt.lrsa); + cxt.lrsa = NULL; end: torture_cleanup_end(); @@ -970,7 +959,7 @@ static int __init lock_torture_init(void) /* Prepare torture context. 
*/ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, - onoff_interval * HZ); + onoff_interval * HZ, NULL); if (firsterr) goto unwind; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..f17dad99eec8 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,8 @@ #include <linux/sched.h> #include <linux/errno.h> +#include "rwsem.h" + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 8a8c3c208c5e..e14b32c69639 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -124,9 +124,6 @@ static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(idx > 3); -#endif tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ @@ -398,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); + lockevent_inc(lock_pending); return; /* @@ -406,18 +403,34 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * queuing. */ queue: - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); pv_queue: node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. 
+ */ + if (unlikely(idx >= MAX_NODES)) { + lockevent_inc(lock_no_node); + while (!queued_spin_trylock(lock)) + cpu_relax(); + goto release; + } + node = grab_mcs_node(node, idx); /* * Keep counts of non-zero index values: */ - qstat_inc(qstat_lock_idx1 + idx - 1, idx); + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) if (!(val & _Q_LOCKED_PENDING_MASK) && (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); + lockevent_inc(pv_lock_stealing); return true; } if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) @@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); - qstat_hop(hopcnt); + lockevent_pv_hop(hopcnt); return &he->lock; } } @@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) smp_store_mb(pn->state, vcpu_halted); if (!READ_ONCE(node->locked)) { - qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_early, wait_early); + lockevent_inc(pv_wait_node); + lockevent_cond_inc(pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. 
*/ - qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); + lockevent_cond_inc(pv_spurious_wakeup, + !READ_ONCE(node->locked)); } /* @@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); for (;; waitcnt++) { /* @@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) } } WRITE_ONCE(pn->state, vcpu_hashed); - qstat_inc(qstat_pv_wait_head, true); - qstat_inc(qstat_pv_wait_again, waitcnt); + lockevent_inc(pv_wait_head); + lockevent_cond_inc(pv_wait_again, waitcnt); pv_wait(&lock->locked, _Q_SLOW_VAL); /* @@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ - qstat_inc(qstat_pv_kick_unlock, true); + lockevent_inc(pv_kick_unlock); pv_kick(node->cpu); } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 42d3d8dc8f49..54152670ff24 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -9,253 +9,105 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com> */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * <debugfs>/qlockstat/ - * pv_hash_hops - average # of hops per hashing operation - * pv_kick_unlock - # of vCPU kicks issued at unlock time - * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake - * pv_latency_kick - average latency (ns) of vCPU kick operation - * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs - * pv_wait_again - # of wait's after a queue head vCPU kick - * pv_wait_early - # of early vCPU wait's - * pv_wait_head - # of vCPU wait's at the queue head - * pv_wait_node - # of vCPU wait's at a non-head queue node - * lock_pending - # of locking operations via pending code - * lock_slowpath - # of locking operations via MCS lock queue - * - * Writing to the "reset_counters" file will reset all the above counter - * values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. - * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
- */ -enum qlock_stats { - qstat_pv_hash_hops, - qstat_pv_kick_unlock, - qstat_pv_kick_wake, - qstat_pv_latency_kick, - qstat_pv_latency_wake, - qstat_pv_lock_stealing, - qstat_pv_spurious_wakeup, - qstat_pv_wait_again, - qstat_pv_wait_early, - qstat_pv_wait_head, - qstat_pv_wait_node, - qstat_lock_pending, - qstat_lock_slowpath, - qstat_lock_idx1, - qstat_lock_idx2, - qstat_lock_idx3, - qstat_num, /* Total number of statistical counters */ - qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts */ -#include <linux/debugfs.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/fs.h> -static const char * const qstat_names[qstat_num + 1] = { - [qstat_pv_hash_hops] = "pv_hash_hops", - [qstat_pv_kick_unlock] = "pv_kick_unlock", - [qstat_pv_kick_wake] = "pv_kick_wake", - [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", - [qstat_pv_latency_kick] = "pv_latency_kick", - [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_stealing] = "pv_lock_stealing", - [qstat_pv_wait_again] = "pv_wait_again", - [qstat_pv_wait_early] = "pv_wait_early", - [qstat_pv_wait_head] = "pv_wait_head", - [qstat_pv_wait_node] = "pv_wait_node", - [qstat_lock_pending] = "lock_pending", - [qstat_lock_slowpath] = "lock_slowpath", - [qstat_lock_idx1] = "lock_index1", - [qstat_lock_idx2] = "lock_index2", - [qstat_lock_idx3] = "lock_index3", - [qstat_reset_cnts] = "reset_counters", -}; +#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] /* - * Per-cpu counters + * PV specific per-cpu counter */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts. * * The following counters are handled specially: - * 1. 
qstat_pv_latency_kick + * 1. pv_latency_kick * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake * Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; - int cpu, counter, len; - u64 stat = 0, kicks = 0; + int cpu, id, len; + u64 sum = 0, kicks = 0; /* * Get the counter ID stored in file->f_inode->i_private */ - counter = (long)file_inode(file)->i_private; + id = (long)file_inode(file)->i_private; - if (counter >= qstat_num) + if (id >= lockevent_num) return -EBADF; for_each_possible_cpu(cpu) { - stat += per_cpu(qstats[counter], cpu); + sum += per_cpu(lockevents[id], cpu); /* - * Need to sum additional counter for some of them + * Need to sum additional counters for some of them */ - switch (counter) { + switch (id) { - case qstat_pv_latency_kick: - case qstat_pv_hash_hops: - kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + case LOCKEVENT_pv_latency_kick: + case LOCKEVENT_pv_hash_hops: + kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu); break; - case qstat_pv_latency_wake: - kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + case LOCKEVENT_pv_latency_wake: + kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu); break; } } - if (counter == qstat_pv_hash_hops) { + if (id == LOCKEVENT_pv_hash_hops) { u64 frac = 0; if (kicks) { - frac = 100ULL * do_div(stat, kicks); + frac = 100ULL * do_div(sum, kicks); frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); } /* * Return a X.XX decimal number */ - len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", + sum, frac); } else { /* * Round to the nearest ns */ - if ((counter == 
qstat_pv_latency_kick) || - (counter == qstat_pv_latency_wake)) { + if ((id == LOCKEVENT_pv_latency_kick) || + (id == LOCKEVENT_pv_latency_wake)) { if (kicks) - stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + sum = DIV_ROUND_CLOSEST_ULL(sum, kicks); } - len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); } return simple_read_from_buffer(user_buf, count, ppos, buf, len); } /* - * Function to handle write request - * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. - */ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - int cpu; - - /* - * Get the counter ID stored in file->f_inode->i_private - */ - if ((long)file_inode(file)->i_private != qstat_reset_cnts) - return count; - - for_each_possible_cpu(cpu) { - int i; - unsigned long *ptr = per_cpu_ptr(qstats, cpu); - - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); - } - return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_qstat = { - .read = qstat_read, - .write = qstat_write, - .llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ - struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); - int i; - - if (!d_qstat) - goto out; - - /* - * Create the debugfs files - * - * As reading from and writing to the stat files can be slow, only - * root is allowed to do the read/write to limit impact to system - * performance. 
- */ - for (i = 0; i < qstat_num; i++) - if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat)) - goto fail_undo; - - if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat)) - goto fail_undo; - - return 0; -fail_undo: - debugfs_remove_recursive(d_qstat); -out: - pr_warn("Could not create 'qlockstat' debugfs entries\n"); - return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ - if (cond) - this_cpu_inc(qstats[stat]); -} - -/* * PV hash hop count */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt) { - this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); + this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt); } /* @@ -267,7 +119,7 @@ static inline void __pv_kick(int cpu) per_cpu(pv_kick_time, cpu) = start; pv_kick(cpu); - this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); + this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start); } /* @@ -280,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) *pkick_time = 0; pv_wait(ptr, val); if (*pkick_time) { - this_cpu_add(qstats[qstat_pv_latency_wake], + this_cpu_add(EVENT_COUNT(pv_latency_wake), sched_clock() - *pkick_time); - qstat_inc(qstat_pv_kick_wake, true); + lockevent_inc(pv_kick_wake); } } #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -static inline void qstat_hop(int hopcnt) { } +static inline void lockevent_pv_hop(int hopcnt) { } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index 
a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/export.h> - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->count != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; 
- struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). - */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->count += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(current); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait to be given the lock */ - for (;;) { - if 
(!waiter.task) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: - return 0; - -out_nolock: - /* - * We didn't take the lock, so that there is a writer, which - * is owner or the first waiter of the sem. If it's a waiter, - * it will be woken by current owner. Not need to wake anybody. - */ - list_del(&waiter.list); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ - __down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ - return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. 
- */ - if (sem->count == 0) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->count = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; - -out_nolock: - list_del(&waiter.list); - if (!list_empty(&sem->wait_list) && sem->count >= 0) - __rwsem_do_wake(sem, 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ - return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count == 0) { - /* got the lock */ - sem->count = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->count == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct 
rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 50d9af615dc4..6b3ee9948bf1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * will notice the queued writer. */ wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); } return; @@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, goto try_reader_grant; } /* - * It is not really necessary to set it to reader-owned here, - * but it gives the spinners an early indication that the - * readers now have the lock. + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock. */ __rwsem_set_reader_owned(sem, waiter->task); } @@ -211,12 +211,11 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * Ensure issuing the wakeup (either by us or someone else) * after setting the reader waiter to nil. 
*/ - wake_q_add(wake_q, tsk); - /* wake_q_add() already take the task ref */ - put_task_struct(tsk); + wake_q_add_safe(wake_q, tsk); } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); if (list_empty(&sem->wait_list)) { /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -227,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, } /* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count. - */ - if (atomic_long_read(&sem->count) >= 0) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - adjustment += RWSEM_WAITING_BIAS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! 
- */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - } - - __set_current_state(TASK_RUNNING); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - -/* * This function must be called with the sem->wait_lock held to prevent * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. 
@@ -348,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = atomic_long_read(&sem->count); - - while (true) { - if (!(count == 0 || count == RWSEM_WAITING_BIAS)) - return false; + long count = atomic_long_read(&sem->count); - old = atomic_long_cmpxchg_acquire(&sem->count, count, - count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) { + while (!count || count == RWSEM_WAITING_BIAS) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count + RWSEM_ACTIVE_WRITE_BIAS)) { rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); return true; } - - count = old; } + return false; } static inline bool owner_on_cpu(struct task_struct *owner) @@ -483,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) osq_unlock(&sem->osq); done: preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); return taken; } @@ -507,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem) #endif /* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer, this reader can exit the slowpath and return + * immediately as its RWSEM_ACTIVE_READ_BIAS has already + * been set in the count. 
+ */ + if (atomic_long_read(&sem->count) >= 0) { + raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); + return sem; + } + adjustment += RWSEM_WAITING_BIAS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = atomic_long_add_return(adjustment, &sem->count); + + /* + * If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + lockevent_inc(rwsem_sleep_reader); + } + + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); + return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ + return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + +/* * Wait until we successfully acquire the write 
lock */ static inline struct rw_semaphore * @@ -582,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) goto out_nolock; schedule(); + lockevent_inc(rwsem_sleep_writer); set_current_state(state); } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); @@ -590,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) __set_current_state(TASK_RUNNING); list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); return ret; @@ -603,6 +606,7 @@ out_nolock: __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); return ERR_PTR(-EINTR); } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..ccbf18f560ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_reader_owned(sem); return 0; } @@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_reader_owned(sem); - } return ret; } @@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem) rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write); @@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_owner(sem); return 0; } @@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) { + if (ret == 1) 
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_owner(sem); - } return ret; } @@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); - rwsem_clear_reader_owned(sem); __up_read(sem); } @@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_clear_owner(sem); __up_write(sem); } @@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); - rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); @@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(_down_write_nest_lock); @@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } EXPORT_SYMBOL(down_write_nested); @@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) return -EINTR; } - rwsem_set_owner(sem); return 0; } @@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & 
RWSEM_READER_OWNED), + sem); __up_read(sem); } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..64877f5294e3 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -23,15 +23,44 @@ * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. */ +#include "lock_events.h" + #define RWSEM_READER_OWNED (1UL << 0) #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) #ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + (long)((sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? "" : "not ")) \ + debug_locks_off(); \ + } while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. + * Adapted largely from include/asm-i386/rwsem.h + * by Paul Mackerras <paulus@samba.org>. 
+ */ + +/* + * the semaphore definition + */ +#ifdef CONFIG_64BIT +# define RWSEM_ACTIVE_MASK 0xffffffffL #else -# define DEBUG_RWSEMS_WARN_ON(c) +# define RWSEM_ACTIVE_MASK 0x0000ffffL #endif +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) +#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { } #endif + +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + +/* + * lock for reading + */ +static inline void __down_read(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + rwsem_down_read_failed(sem); + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { + if (IS_ERR(rwsem_down_read_failed_killable(sem))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & + RWSEM_READER_OWNED), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + /* + * Optimize for the case when the rwsem is not locked at all. 
+ */ + long tmp = RWSEM_UNLOCKED_VALUE; + + lockevent_inc(rwsem_rtrylock); + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_ACTIVE_READ_BIAS)) { + rwsem_set_reader_owned(sem); + return 1; + } + } while (tmp >= 0); + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + rwsem_down_write_failed(sem); + rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp; + + tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count); + if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) + if (IS_ERR(rwsem_down_write_failed_killable(sem))) + return -EINTR; + rwsem_set_owner(sem); + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp; + + lockevent_inc(rwsem_wtrylock); + tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); + if (tmp == RWSEM_UNLOCKED_VALUE) { + rwsem_set_owner(sem); + return true; + } + return false; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), + sem); + rwsem_clear_reader_owned(sem); + tmp = atomic_long_dec_return_release(&sem->count); + if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) + rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + rwsem_clear_owner(sem); + if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, + &sem->count) < 0)) + rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from 
exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. + */ + DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); + tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); + rwsem_set_reader_owned(sem); + if (tmp < 0) + rwsem_downgrade_wake(sem); +} diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -22,6 +22,13 @@ #include <linux/debug_locks.h> #include <linux/export.h> +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif + /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); arch_spin_lock(&lock->raw_lock); + mmiowb_spin_lock(); debug_spin_lock_after(lock); } @@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&lock->raw_lock); - if (ret) + if (ret) { + mmiowb_spin_lock(); debug_spin_lock_after(lock); + } #ifndef CONFIG_SMP /* * Must not happen on UP: @@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) void do_raw_spin_unlock(raw_spinlock_t *lock) { + mmiowb_spin_unlock(); debug_spin_unlock(lock); arch_spin_unlock(&lock->raw_lock); } diff --git a/kernel/module.c b/kernel/module.c index 2ad1b5239910..a9020bdd4cf6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex); 
EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +/* Work queue for freeing init sections in success case */ +static struct work_struct init_free_wq; +static struct llist_head init_free_list; + #ifdef CONFIG_MODULES_TREE_LOOKUP /* @@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init) if (!rodata_enabled) return; + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->core_layout, set_memory_x); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_ro); if (after_init) @@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -static void module_disable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_x); - frob_ro_after_init(&mod->core_layout, set_memory_x); - frob_writable_data(&mod->core_layout, set_memory_x); - frob_rodata(&mod->init_layout, set_memory_x); - frob_writable_data(&mod->init_layout, set_memory_x); -} - /* Iterate through all modules and set each module's text as RW */ void set_all_modules_text_rw(void) { @@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void) } mutex_unlock(&module_mutex); } - -static void disable_ro_nx(const struct module_layout *layout) -{ - if (rodata_enabled) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); - frob_ro_after_init(layout, set_memory_rw); - } - frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_x); - frob_writable_data(layout, set_memory_x); -} - #else -static void disable_ro_nx(const struct module_layout *layout) { } static void module_enable_nx(const struct module *mod) { } -static void module_disable_nx(const struct module *mod) { } #endif #ifdef CONFIG_LIVEPATCH @@ -2115,6 
+2102,11 @@ static void free_module_elf(struct module *mod) void __weak module_memfree(void *module_region) { + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. + */ + WARN_ON(in_interrupt()); vfree(module_region); } @@ -2166,7 +2158,6 @@ static void free_module(struct module *mod) mutex_unlock(&module_mutex); /* This may be empty, but that's OK */ - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); kfree(mod->args); @@ -2176,7 +2167,6 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); } @@ -2719,11 +2709,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig { if (!debug) return; -#ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, mod->name)) - pr_err("dynamic debug error adding module: %s\n", - debug->modname); -#endif + ddebug_add_module(debug, num, mod->name); } static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) @@ -3419,17 +3405,34 @@ static void do_mod_ctors(struct module *mod) /* For freeing module_init on success, in case kallsyms traversing */ struct mod_initfree { - struct rcu_head rcu; + struct llist_node node; void *module_init; }; -static void do_free_init(struct rcu_head *head) +static void do_free_init(struct work_struct *w) { - struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); - module_memfree(m->module_init); - kfree(m); + struct llist_node *pos, *n, *list; + struct mod_initfree *initfree; + + list = llist_del_all(&init_free_list); + + synchronize_rcu(); + + llist_for_each_safe(pos, n, list) { + initfree = container_of(pos, struct mod_initfree, node); + module_memfree(initfree->module_init); + kfree(initfree); + } } +static int __init modules_wq_init(void) 
+{ + INIT_WORK(&init_free_wq, do_free_init); + init_llist_head(&init_free_list); + return 0; +} +module_init(modules_wq_init); + /* * This is where the real work happens. * @@ -3506,7 +3509,6 @@ static noinline int do_init_module(struct module *mod) #endif module_enable_ro(mod, true); mod_tree_remove_init(mod); - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; @@ -3517,14 +3519,18 @@ static noinline int do_init_module(struct module *mod) * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success - * path, so use actual RCU here. + * path. module_memfree() cannot be called in an interrupt, so do the + * work and call synchronize_rcu() in a work queue. + * * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. 
Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work - ie * rcu_barrier() */ - call_rcu(&freeinit->rcu, do_free_init); + if (llist_add(&freeinit->node, &init_free_list)) + schedule_work(&init_free_wq); + mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3821,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); - /* we can't deallocate the module until we clear memory protection */ - module_disable_ro(mod); - module_disable_nx(mod); - ddebug_cleanup: ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = { ¶llel_cpumask_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(padata_default); static ssize_t padata_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = { static struct kobj_type padata_attr_type = { .sysfs_ops = &padata_sysfs_ops, - .default_attrs = padata_default_attrs, + .default_groups = padata_default_groups, .release = padata_sysfs_release, }; diff --git a/kernel/panic.c b/kernel/panic.c index f121e6ba7e11..c1fcaad337b7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -318,12 +318,7 @@ void panic(const char *fmt, ...) 
} #endif #if defined(CONFIG_S390) - { - unsigned long caller; - - caller = (unsigned long)__builtin_return_address(0); - disabled_wait(caller); - } + disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); @@ -642,16 +637,14 @@ static int clear_warn_once_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, - NULL, - clear_warn_once_set, - "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set, + "%lld\n"); static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0200, NULL, - NULL, &clear_warn_once_fops); + debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL, + &clear_warn_once_fops); return 0; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index f8fe57d1022e..9bbaaab14b36 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -114,6 +114,15 @@ config PM_SLEEP_SMP depends on PM_SLEEP select HOTPLUG_CPU +config PM_SLEEP_SMP_NONZERO_CPU + def_bool y + depends on PM_SLEEP_SMP + depends on ARCH_SUSPEND_NONZERO_CPU + ---help--- + If an arch can suspend (for suspend, hibernate, kexec, etc) on a + non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This + will allow nohz_full mask to include CPU0. 
+ config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d9dc2c38764a..7d66ee68aaaf 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -10,6 +10,7 @@ #include <linux/cpu.h> #include <linux/cpumask.h> +#include <linux/debugfs.h> #include <linux/energy_model.h> #include <linux/sched/topology.h> #include <linux/slab.h> @@ -23,6 +24,60 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data); */ static DEFINE_MUTEX(em_pd_mutex); +#ifdef CONFIG_DEBUG_FS +static struct dentry *rootdir; + +static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +{ + struct dentry *d; + char name[24]; + + snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + + /* Create per-cs directory */ + d = debugfs_create_dir(name, pd); + debugfs_create_ulong("frequency", 0444, d, &cs->frequency); + debugfs_create_ulong("power", 0444, d, &cs->power); + debugfs_create_ulong("cost", 0444, d, &cs->cost); +} + +static int em_debug_cpus_show(struct seq_file *s, void *unused) +{ + seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); + +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +{ + struct dentry *d; + char name[8]; + int i; + + snprintf(name, sizeof(name), "pd%d", cpu); + + /* Create the directory of the performance domain */ + d = debugfs_create_dir(name, rootdir); + + debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + + /* Create a sub-directory for each capacity state */ + for (i = 0; i < pd->nr_cap_states; i++) + em_debug_create_cs(&pd->table[i], d); +} + +static int __init em_debug_init(void) +{ + /* Create /sys/kernel/debug/energy_model directory */ + rootdir = debugfs_create_dir("energy_model", NULL); + + return 0; +} +core_initcall(em_debug_init); +#else /* CONFIG_DEBUG_FS */ +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} 
+#endif static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, struct em_data_callback *cb) { @@ -102,6 +157,8 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, pd->nr_cap_states = nr_states; cpumask_copy(to_cpumask(pd->cpus), span); + em_debug_create_pd(pd, cpu); + return pd; free_cs_table: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..c8c272df7154 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,7 +14,6 @@ #include <linux/export.h> #include <linux/suspend.h> -#include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> @@ -281,7 +280,7 @@ static int create_image(int platform_mode) if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; @@ -323,7 +322,7 @@ static int create_image(int platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_finish: platform_finish(platform_mode); @@ -417,7 +416,7 @@ int hibernation_snapshot(int platform_mode) int __weak hibernate_resume_nonboot_cpu_disable(void) { - return disable_nonboot_cpus(); + return suspend_disable_secondary_cpus(); } /** @@ -486,7 +485,7 @@ static int resume_target_kernel(bool platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); @@ -564,7 +563,7 @@ int hibernation_platform_enter(void) if (error) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; @@ -586,7 +585,7 @@ int hibernation_platform_enter(void) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); @@ -709,9 +708,7 @@ int 
hibernate(void) goto Exit; } - pr_info("Syncing filesystems ... \n"); - ksys_sync(); - pr_info("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..4f43e724f6eb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -16,6 +16,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/suspend.h> +#include <linux/syscalls.h> #include "power.h" @@ -51,6 +52,19 @@ void unlock_system_sleep(void) } EXPORT_SYMBOL_GPL(unlock_system_sleep); +void ksys_sync_helper(void) +{ + ktime_t start; + long elapsed_msecs; + + start = ktime_get(); + ksys_sync(); + elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); + pr_info("Filesystems sync: %ld.%03ld seconds\n", + elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/qos.c b/kernel/power/qos.c index b7a82502857a..9d22131afc1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -582,10 +582,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) qos->pm_qos_power_miscdev.name = qos->name; qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - if (d) { - (void)debugfs_create_file(qos->name, S_IRUGO, d, - (void *)qos, &pm_qos_debug_fops); - } + debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos, + &pm_qos_debug_fops); return misc_register(&qos->pm_qos_power_miscdev); } @@ -685,8 +683,6 @@ static int __init pm_qos_power_init(void) BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); d = debugfs_create_dir("pm_qos", NULL); - if (IS_ERR_OR_NULL(d)) - d = NULL; for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i], d); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c 
+++ b/kernel/power/snapshot.c @@ -965,6 +965,9 @@ void __init __register_nosave_region(unsigned long start_pfn, /* This allocation cannot fail */ region = memblock_alloc(sizeof(struct nosave_region), SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); } region->start_pfn = start_pfn; region->end_pfn = end_pfn; @@ -1215,14 +1218,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1277,8 +1282,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); @@ -1286,6 +1291,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; @@ -1334,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src) * safe_copy_page - Copy a page in a safe way. * * Check if the page we are going to copy is marked as present in the kernel - * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set - * and in that case kernel_page_present() always returns 'true'). + * page tables. 
This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'. */ static void safe_copy_page(void *dst, struct page *s_page) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..ef908c134b34 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -17,7 +17,6 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/cpuidle.h> -#include <linux/syscalls.h> #include <linux/gfp.h> #include <linux/io.h> #include <linux/kernel.h> @@ -428,7 +427,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -458,7 +457,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) BUG_ON(irqs_disabled()); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_wake: platform_resume_noirq(state); @@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) s2idle_begin(); -#ifndef CONFIG_SUSPEND_SKIP_SYNC - trace_suspend_resume(TPS("sync_filesystems"), 0, true); - pr_info("Syncing filesystems ... 
"); - ksys_sync(); - pr_cont("done.\n"); - trace_suspend_resume(TPS("sync_filesystems"), 0, false); -#endif + if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + ksys_sync_helper(); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + } pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..cb24e840a3e6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -10,7 +10,6 @@ */ #include <linux/suspend.h> -#include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> @@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - printk("Syncing filesystems ... "); - ksys_sync(); - printk("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d3d170374ceb..02ca827b8fac 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -65,6 +65,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); @@ -344,7 +345,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; enum log_flags { LOG_NEWLINE = 2, /* text ended with a newline */ - LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -356,6 +356,9 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ +#ifdef CONFIG_PRINTK_CALLER + u32 caller_id; /* thread id or processor id */ +#endif } #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS __packed __aligned(4) @@ -422,7 +425,11 @@ static u64 
exclusive_console_stop_seq; static u64 clear_seq; static u32 clear_idx; +#ifdef CONFIG_PRINTK_CALLER +#define PREFIX_MAX 48 +#else #define PREFIX_MAX 32 +#endif #define LOG_LINE_MAX (1024 - PREFIX_MAX) #define LOG_LEVEL(v) ((v) & 0x07) @@ -577,7 +584,7 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, } /* insert record into the buffer, discard old ones, update heads */ -static int log_store(int facility, int level, +static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) @@ -625,6 +632,9 @@ static int log_store(int facility, int level, msg->ts_nsec = ts_nsec; else msg->ts_nsec = local_clock(); +#ifdef CONFIG_PRINTK_CALLER + msg->caller_id = caller_id; +#endif memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = size; @@ -688,12 +698,21 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; + char caller[20]; +#ifdef CONFIG_PRINTK_CALLER + u32 id = msg->caller_id; + + snprintf(caller, sizeof(caller), ",caller=%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); +#else + caller[0] = '\0'; +#endif do_div(ts_usec, 1000); - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-'); + return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 
'c' : '-', caller); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -1038,6 +1057,9 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_log, len); VMCOREINFO_OFFSET(printk_log, text_len); VMCOREINFO_OFFSET(printk_log, dict_len); +#ifdef CONFIG_PRINTK_CALLER + VMCOREINFO_OFFSET(printk_log, caller_id); +#endif } #endif @@ -1122,14 +1144,7 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; - if (early) { - new_log_buf = - memblock_alloc(new_log_buf_len, LOG_ALIGN); - } else { - new_log_buf = memblock_alloc_nopanic(new_log_buf_len, - LOG_ALIGN); - } - + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu bytes not available\n", new_log_buf_len); @@ -1236,10 +1251,23 @@ static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); - return sprintf(buf, "[%5lu.%06lu] ", + return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } +#ifdef CONFIG_PRINTK_CALLER +static size_t print_caller(u32 id, char *buf) +{ + char caller[12]; + + snprintf(caller, sizeof(caller), "%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); + return sprintf(buf, "[%6s]", caller); +} +#else +#define print_caller(id, buf) 0 +#endif + static size_t print_prefix(const struct printk_log *msg, bool syslog, bool time, char *buf) { @@ -1247,8 +1275,17 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, if (syslog) len = print_syslog((msg->facility << 3) | msg->level, buf); + if (time) len += print_time(msg->ts_nsec, buf + len); + + len += print_caller(msg->caller_id, buf + len); + + if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { + buf[len++] = ' '; + buf[len] = '\0'; + } + return len; } @@ -1752,6 +1789,12 @@ static inline void printk_delay(void) } } +static inline u32 printk_caller_id(void) +{ + return in_task() ? 
task_pid_nr(current) : + 0x80000000 + raw_smp_processor_id(); +} + /* * Continuation lines are buffered, and not committed to the record buffer * until the line is complete, or a race forces it. The line fragments @@ -1761,7 +1804,7 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - struct task_struct *owner; /* task of first print*/ + u32 caller_id; /* printk_caller_id() of first print */ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ @@ -1773,12 +1816,13 @@ static void cont_flush(void) if (cont.len == 0) return; - log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); + log_store(cont.caller_id, cont.facility, cont.level, cont.flags, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); cont.len = 0; } -static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) +static bool cont_add(u32 caller_id, int facility, int level, + enum log_flags flags, const char *text, size_t len) { /* If the line gets too long, split it up in separate records. */ if (cont.len + len > sizeof(cont.buf)) { @@ -1789,7 +1833,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * if (!cont.len) { cont.facility = facility; cont.level = level; - cont.owner = current; + cont.caller_id = caller_id; cont.ts_nsec = local_clock(); cont.flags = flags; } @@ -1809,13 +1853,15 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { + const u32 caller_id = printk_caller_id(); + /* * If an earlier line was buffered, and we're a continuation - * write from the same process, try to add it to the buffer. + * write from the same context, try to add it to the buffer. 
*/ if (cont.len) { - if (cont.owner == current && (lflags & LOG_CONT)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Otherwise, make sure it's flushed */ @@ -1828,12 +1874,13 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Store it in the record log */ - return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); + return log_store(caller_id, facility, level, lflags, 0, + dict, dictlen, text, text_len); } /* Must be called under logbuf_lock. */ @@ -1867,9 +1914,6 @@ int vprintk_store(int facility, int level, case '0' ... '7': if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; - /* fallthrough */ - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; break; case 'c': /* KERN_CONT */ lflags |= LOG_CONT; @@ -1884,7 +1928,7 @@ int vprintk_store(int facility, int level, level = default_message_loglevel; if (dict) - lflags |= LOG_PREFIX|LOG_NEWLINE; + lflags |= LOG_NEWLINE; return log_output(facility, level, lflags, dict, dictlen, text, text_len); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 771e93f9c43f..6f357f4fc859 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -29,6 +29,7 @@ #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> #include <linux/compat.h> +#include <linux/sched/signal.h> /* * Access another process' address space via ptrace. 
@@ -924,18 +925,26 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; - case PTRACE_GETSIGMASK: + case PTRACE_GETSIGMASK: { + sigset_t *mask; + if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } - if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + if (test_tsk_restore_sigmask(child)) + mask = &child->saved_sigmask; + else + mask = &child->blocked; + + if (copy_to_user(datavp, mask, sizeof(sigset_t))) ret = -EFAULT; else ret = 0; break; + } case PTRACE_SETSIGMASK: { sigset_t new_set; @@ -961,6 +970,8 @@ int ptrace_request(struct task_struct *child, long request, child->blocked = new_set; spin_unlock_irq(&child->sighand->siglock); + clear_tsk_restore_sigmask(child); + ret = 0; break; } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 939a2056c87a..37301430970e 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -87,36 +87,6 @@ config RCU_STALL_COMMON config RCU_NEED_SEGCBLIST def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) -config CONTEXT_TRACKING - bool - -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also - other dependencies to provide in order to make the full - dynticks working. - - This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the - requirements to make the full dynticks feature working. - Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU - userspace extended quiescent state and tickless cputime - accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all - CPUs in the system. 
- - Say Y only if you're working on the development of an - architecture backend for the context tracking. - - Say N otherwise, this option brings an overhead that you - don't want in production. - - config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a393e24a9195..4b58c907b4b7 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update definitions shared among RCU implementations. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2011 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ #ifndef __LINUX_RCU_H @@ -30,7 +17,7 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ +/* Offset to allow distinguishing irq vs. task-based idle entry/exit. 
*/ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -246,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; +extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); #define rcu_ftrace_dump_stall_suppress() \ @@ -462,8 +450,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, - RCU_BH_FLAVOR, - RCU_SCHED_FLAVOR, RCU_TASKS_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5aff271adf1e..9bd5f6023c21 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU segmented callback lists, function definitions * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com> */ #include <linux/types.h> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 948470cef385..71b64648464e 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU segmented callback lists, internal-to-rcu header file * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/rcu_segcblist.h> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b459da70b4fc..7a6890b23c5f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based performance-test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2015 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #define pr_fmt(fmt) fmt @@ -54,7 +41,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ @@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); * Various other use cases may of course be specified. */ +#ifdef MODULE +# define RCUPERF_SHUTDOWN 0 +#else +# define RCUPERF_SHUTDOWN 1 +#endif + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, !IS_ENABLED(MODULE), +torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); @@ -501,6 +494,10 @@ rcu_perf_cleanup(void) if (torture_cleanup_begin()) return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) @@ -621,6 +618,7 @@ rcu_perf_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->init) diff --git 
a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f6e85faa4ff4..efaa5b3f4d3f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based torture test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2005, 2006 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Josh Triplett <josh@joshtriplett.org> * * See also: Documentation/RCU/torture.txt @@ -61,7 +48,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); /* Bits for ->extendables field, extendables param, and related definitions. 
*/ @@ -312,7 +299,6 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; - int ext_irq_conflict; const char *name; }; @@ -605,12 +591,7 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - static DEFINE_TORTURE_RANDOM(rand); - - if (torture_random(&rand) & 0x800) - cleanup_srcu_struct(&srcu_ctld); - else - cleanup_srcu_struct_quiesced(&srcu_ctld); + cleanup_srcu_struct(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } @@ -1173,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long randmask2 = randmask1 >> 3; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - /* Most of the time lots of bits, half the time only one bit. */ + /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else @@ -1183,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - if ((mask & RCUTORTURE_RDR_IRQ) && - !(mask & cur_ops->ext_irq_conflict) && - (oldmask & cur_ops->ext_irq_conflict)) - mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ return mask ?: RCUTORTURE_RDR_RCU; } @@ -1630,21 +1607,34 @@ static bool rcu_fwd_emergency_stop; #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. 
*/ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; +struct rcu_launder_hist { + long n_launders; + unsigned long launder_gp_seq; +}; +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) +static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; +static unsigned long rcu_launder_gp_seq_start; static void rcu_torture_fwd_cb_hist(void) { + unsigned long gps; + unsigned long gps_old; int i; int j; for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) + if (n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", __func__, jiffies - rcu_fwd_startat); - for (j = 0; j <= i; j++) - pr_cont(" %ds/%d: %ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); + gps_old = rcu_launder_gp_seq_start; + for (j = 0; j <= i; j++) { + gps = n_launders_hist[j].launder_gp_seq; + pr_cont(" %ds/%d: %ld:%ld", + j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, + rcutorture_seq_diff(gps, gps_old)); + gps_old = gps; + } pr_cont("\n"); } @@ -1666,7 +1656,8 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i]++; + n_launders_hist[i].n_launders++; + n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); spin_unlock_irqrestore(&rcu_fwd_lock, flags); } @@ -1786,9 +1777,10 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs = 0; n_max_gps = 0; for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i] = 0; + n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); + rcu_launder_gp_seq_start = gps; while (time_before(jiffies, stopat) && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); @@ -1846,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, 
WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", @@ -2092,6 +2084,10 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } + if (!cur_ops) { + torture_cleanup_end(); + return; + } rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); @@ -2228,6 +2224,14 @@ static void rcu_test_debug_objects(void) #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } +static void rcutorture_sync(void) +{ + static unsigned long n; + + if (cur_ops->sync && !(++n & 0xfff)) + cur_ops->sync(); +} + static int __init rcu_torture_init(void) { @@ -2257,6 +2261,7 @@ rcu_torture_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { @@ -2389,7 +2394,8 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, + rcutorture_sync); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32dfd6522548..44d6606b8325 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tiny version for non-preemptible single-CPU use. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2017 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> */ #include <linux/export.h> @@ -89,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); - if (quiesced) - WARN_ON(work_pending(&ssp->srcu_work)); - else - flush_work(&ssp->srcu_work); + flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3600d88d8956..9b761e546de8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -58,6 +45,7 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ #define spin_lock_rcu_node(p) \ @@ -156,7 +144,8 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) snp->grphi = cpu; } sdp->cpu = cpu; - INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) @@ -371,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) return SRCU_INTERVAL; } -/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). 
*/ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @ssp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; @@ -380,19 +375,15 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - if (quiesced) { - if (WARN_ON(delayed_work_pending(&ssp->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&ssp->work); + flush_delayed_work(&ssp->work); + for_each_possible_cpu(cpu) { + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); + + del_timer_sync(&sdp->delay_work); + flush_work(&sdp->work); + if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) + return; /* Forgot srcu_barrier(), so just leak it! */ } - for_each_possible_cpu(cpu) - if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); - } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p state: %d\n", @@ -402,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) free_percpu(ssp->sda); ssp->sda = NULL; } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the @@ -463,39 +454,23 @@ static void srcu_gp_start(struct srcu_struct *ssp) WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } -/* - * Track online CPUs to guide callback workqueue placement. 
- */ -DEFINE_PER_CPU(bool, srcu_online); -void srcu_online_cpu(unsigned int cpu) +static void srcu_delay_timer(struct timer_list *t) { - WRITE_ONCE(per_cpu(srcu_online, cpu), true); -} + struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work); -void srcu_offline_cpu(unsigned int cpu) -{ - WRITE_ONCE(per_cpu(srcu_online, cpu), false); + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); } -/* - * Place the workqueue handler on the specified CPU if online, otherwise - * just run it whereever. This is useful for placing workqueue handlers - * that are to invoke the specified CPU's callbacks. - */ -static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, +static void srcu_queue_delayed_work_on(struct srcu_data *sdp, unsigned long delay) { - bool ret; + if (!delay) { + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); + return; + } - preempt_disable(); - if (READ_ONCE(per_cpu(srcu_online, cpu))) - ret = queue_delayed_work_on(cpu, wq, dwork, delay); - else - ret = queue_delayed_work(wq, dwork, delay); - preempt_enable(); - return ret; + timer_reduce(&sdp->delay_work, jiffies + delay); } /* @@ -504,7 +479,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); + srcu_queue_delayed_work_on(sdp, delay); } /* @@ -1186,7 +1161,8 @@ static void srcu_invoke_callbacks(struct work_struct *work) struct srcu_data *sdp; struct srcu_struct *ssp; - sdp = container_of(work, struct srcu_data, work.work); + sdp = container_of(work, struct srcu_data, work); + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be10036fa621..a8304d90573f 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU-based infrastructure for 
lightweight reader-writer locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 5f5963ba313e..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. 
McKenney <paulmck@linux.ibm.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU @@ -65,7 +52,7 @@ void rcu_qs(void) local_irq_save(flags); if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; - raise_softirq(RCU_SOFTIRQ); + raise_softirq_irqoff(RCU_SOFTIRQ); } local_irq_restore(flags); } @@ -76,7 +63,7 @@ void rcu_qs(void) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { if (user) { rcu_qs(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9180158756d2..b4d88a594785 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,27 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version + * Paul E. 
McKenney <paulmck@linux.ibm.com> Hierarchical version * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - @@ -62,6 +49,8 @@ #include <linux/suspend.h> #include <linux/ftrace.h> #include <linux/tick.h> +#include <linux/sysrq.h> +#include <linux/kprobes.h> #include "tree.h" #include "rcu.h" @@ -113,8 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable is initialized to the value @@ -157,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444); /* Delay in jiffies for grace-period initialization delays, debug only. */ @@ -414,7 +401,7 @@ static bool rcu_kick_kthreads; */ static ulong jiffies_till_sched_qs = ULONG_MAX; module_param(jiffies_till_sched_qs, ulong, 0444); -static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */ module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ /* @@ -432,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void) WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); return; } + /* Otherwise, set to third fqs scan, but bound below on large system. 
*/ j = READ_ONCE(jiffies_till_first_fqs) + 2 * READ_ONCE(jiffies_till_next_fqs); if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) @@ -479,7 +467,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static void force_quiescent_state(void); static int rcu_pending(void); /* @@ -504,13 +491,12 @@ unsigned long rcu_exp_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); /* - * Force a quiescent state. + * Return the root node of the rcu_state structure. */ -void rcu_force_quiescent_state(void) +static struct rcu_node *rcu_get_root(void) { - force_quiescent_state(); + return &rcu_state.node[0]; } -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* * Convert a ->gp_state value to a character string. @@ -523,42 +509,6 @@ static const char *gp_state_getname(short gs) } /* - * Show the state of the grace-period kthreads. - */ -void show_rcu_gp_kthreads(void) -{ - int cpu; - unsigned long j; - struct rcu_data *rdp; - struct rcu_node *rnp; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", - rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, rcu_state.gp_kthread->state, j); - rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) - continue; - pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", - rnp->grplo, rnp->grphi, rnp->gp_seq, - rnp->gp_seq_needed); - if (!rcu_is_leaf_node(rnp)) - continue; - for_each_leaf_node_possible_cpu(rnp, cpu) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) - continue; - pr_info("\tcpu %d ->gp_seq_needed %lu\n", - cpu, rdp->gp_seq_needed); - } - } - /* sched_show_task(rcu_state.gp_kthread); */ -} -EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); - -/* * Send along 
grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -566,8 +516,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, { switch (test_type) { case RCU_FLAVOR: - case RCU_BH_FLAVOR: - case RCU_SCHED_FLAVOR: *flags = READ_ONCE(rcu_state.gp_flags); *gp_seq = rcu_seq_current(&rcu_state.gp_seq); break; @@ -578,14 +526,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* - * Return the root node of the rcu_state structure. - */ -static struct rcu_node *rcu_get_root(void) -{ - return &rcu_state.node[0]; -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -701,7 +641,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq) /** * rcu_nmi_exit - inform RCU of exit from NMI context - * @irq: Is this call from rcu_irq_exit? * * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. @@ -872,6 +811,7 @@ void rcu_nmi_enter(void) { rcu_nmi_enter_common(false); } +NOKPROBE_SYMBOL(rcu_nmi_enter); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -1022,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) } /* - * Handler for the irq_work request posted when a grace period has - * gone on for too long, but not yet long enough for an RCU CPU - * stall warning. Set state appropriately, but just complain if - * there is unexpected state on entry. 
- */ -static void rcu_iw_handler(struct irq_work *iwp) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = container_of(iwp, struct rcu_data, rcu_iw); - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); - if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gp_seq = rnp->gp_seq; - rdp->rcu_iw_pending = false; - } - raw_spin_unlock_rcu_node(rnp); -} - -/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -1115,7 +1034,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks! + * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq! * The above code handles this, but only for straight cond_resched(). * And some in-kernel loops check need_resched() before calling * cond_resched(), which defeats the above code for CPUs that are @@ -1155,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static void record_gp_stall_check_time(void) -{ - unsigned long j = jiffies; - unsigned long j1; - - rcu_state.gp_start = j; - j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ - rcu_state.jiffies_resched = j + j1 / 2; - rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. - */ -static void rcu_check_gp_kthread_starvation(void) -{ - struct task_struct *gpk = rcu_state.gp_kthread; - unsigned long j; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rcu_state.name, j, - (long)rcu_seq_current(&rcu_state.gp_seq), - rcu_state.gp_flags, - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? 
task_cpu(gpk) : -1); - if (gpk) { - pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(gpk); - wake_up_process(gpk); - } - } -} - -/* - * Dump stacks of all tasks running on stalled CPUs. First try using - * NMIs, but fall back to manual remote stack tracing on architectures - * that don't support NMI-based stack dumps. The NMI-triggered stack - * traces are more accurate because they are printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(void) -{ - int cpu; - unsigned long flags; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ - unsigned long j; - - if (!rcu_kick_kthreads) - return; - j = READ_ONCE(rcu_state.jiffies_kick_kthreads); - if (time_after(jiffies, j) && rcu_state.gp_kthread && - (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", - rcu_state.name); - rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rcu_state.gp_kthread); - WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); - } -} - -static void panic_on_rcu_stall(void) -{ - if (sysctl_panic_on_rcu_stall) - panic("RCU Stall\n"); -} - -static void print_other_cpu_stall(unsigned long gp_seq) -{ - int cpu; - unsigned long flags; - unsigned long gpa; - unsigned long j; - int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on our buddy... 
- * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); - print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { - print_cpu_stall_info(cpu); - ndetected++; - } - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - if (ndetected) { - rcu_dump_cpu_stacks(); - - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(); - } else { - if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { - pr_err("INFO: Stall ended before state dump start\n"); - } else { - j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", - rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), - rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); - } - } - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - - rcu_check_gp_kthread_starvation(); - - panic_on_rcu_stall(); - - force_quiescent_state(); /* Kick them all. 
*/ -} - -static void print_cpu_stall(void) -{ - int cpu; - unsigned long flags; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); - print_cpu_stall_info_begin(); - raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); - print_cpu_stall_info(smp_processor_id()); - raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - - rcu_check_gp_kthread_starvation(); - - rcu_dump_cpu_stacks(); - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - panic_on_rcu_stall(); - - /* - * Attempt to revive the RCU machinery by forcing a context switch. - * - * A context switch would normally allow the RCU state machine to make - * progress and it could be we're stuck in kernel space without context - * switches for an entirely unreasonable amount of time. 
- */ - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - -static void check_cpu_stall(struct rcu_data *rdp) -{ - unsigned long gs1; - unsigned long gs2; - unsigned long gps; - unsigned long j; - unsigned long jn; - unsigned long js; - struct rcu_node *rnp; - - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || - !rcu_gp_in_progress()) - return; - rcu_stall_kick_kthreads(); - j = jiffies; - - /* - * Lots of memory barriers to reject false positives. - * - * The idea is to pick up rcu_state.gp_seq, then - * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally - * another copy of rcu_state.gp_seq. These values are updated in - * the opposite order with memory barriers (or equivalent) during - * grace-period initialization and cleanup. Now, a false positive - * can occur if we get an new value of rcu_state.gp_start and a old - * value of rcu_state.jiffies_stall. But given the memory barriers, - * the only way that this can happen is if one grace period ends - * and another starts between these two fetches. This is detected - * by comparing the second fetch of rcu_state.gp_seq with the - * previous fetch from rcu_state.gp_seq. - * - * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, - * and rcu_state.gp_start suffice to forestall false positives. - */ - gs1 = READ_ONCE(rcu_state.gp_seq); - smp_rmb(); /* Pick up ->gp_seq first... */ - js = READ_ONCE(rcu_state.jiffies_stall); - smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = READ_ONCE(rcu_state.gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ - gs2 = READ_ONCE(rcu_state.gp_seq); - if (gs1 != gs2 || - ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) - return; /* No stall or GP completed since entering function. 
*/ - rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); - } -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) @@ -1557,17 +1187,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken, and don't - * bother awakening when there is nothing for the grace-period kthread - * to do (as in several CPUs raced to awaken, and we lost), and finally - * don't try to awaken a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in + * an interrupt or softirq handler), and don't bother awakening when there + * is nothing for the grace-period kthread to do (as in several CPUs raced + * to awaken, and we lost), and finally don't try to awaken a kthread that + * has not yet been created. If all those checks are passed, track some + * debug information and awaken. 
+ * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context? Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition. In this case, a wakeup really + * is required, and is therefore supplied. */ static void rcu_gp_kthread_wake(void) { - if (current == rcu_state.gp_kthread || + if ((current == rcu_state.gp_kthread && + !in_irq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !rcu_state.gp_kthread) return; + WRITE_ONCE(rcu_state.gp_wake_time, jiffies); + WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); swake_up_one(&rcu_state.gp_wq); } @@ -1711,7 +1352,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ - if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) rdp->gp_seq_needed = rnp->gp_seq_needed; WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1939,7 +1580,7 @@ static void rcu_gp_fqs_loop(void) if (!ret) { rcu_state.jiffies_force_qs = jiffies + j; WRITE_ONCE(rcu_state.jiffies_kick_kthreads, - jiffies + 3 * j); + jiffies + (j ? 3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), @@ -2272,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = false; - /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. 
@@ -2497,14 +2137,14 @@ static void rcu_do_batch(struct rcu_data *rdp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * This function is invoked from each scheduling-clock interrupt, + * and checks to see if this CPU is in a non-context-switch quiescent + * state, for example, user mode or idle loop. It also schedules RCU + * core processing. If the current grace period has gone on too long, + * it will ask the scheduler to manufacture a context switch for the sole + * purpose of providing the needed quiescent state. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2517,7 +2157,7 @@ void rcu_check_callbacks(int user) } __this_cpu_write(rcu_data.rcu_urgent_qs, false); } - rcu_flavor_check_callbacks(user); + rcu_flavor_sched_clock_irq(user); if (rcu_pending()) invoke_rcu_core(); @@ -2525,11 +2165,11 @@ void rcu_check_callbacks(int user) } /* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods. + * Scan the leaf rcu_node structures. For each structure on which all + * CPUs have reported a quiescent state and on which there are tasks + * blocking the current grace period, initiate RCU priority boosting. + * Otherwise, invoke the specified function to check dyntick state for + * each CPU that has not yet reported a quiescent state.
*/ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { @@ -2578,7 +2218,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(void) +void rcu_force_quiescent_state(void) { unsigned long flags; bool ret; @@ -2610,113 +2250,10 @@ static void force_quiescent_state(void) raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(); } +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * This function checks for grace-period requests that fail to motivate - * RCU to come out of its idle mode. - */ -void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, - const unsigned long gpssdelay) -{ - unsigned long flags; - unsigned long j; - struct rcu_node *rnp_root = rcu_get_root(); - static atomic_t warned = ATOMIC_INIT(0); - - if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) - return; - j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) - return; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - /* Hold onto the leaf lock to make others see warned==1. */ - - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. 
*/ - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || - atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", - __func__, (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rnp_root->gp_seq_needed), - j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, - rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL); - WARN_ON(1); - if (rnp_root != rnp) - raw_spin_unlock_rcu_node(rnp_root); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Do a forward-progress check for rcutorture. This is normally invoked - * due to an OOM event. The argument "j" gives the time period during - * which rcutorture would like progress to have been made. 
- */ -void rcu_fwd_progress_check(unsigned long j) -{ - unsigned long cbs; - int cpu; - unsigned long max_cbs = 0; - int max_cpu = -1; - struct rcu_data *rdp; - - if (rcu_gp_in_progress()) { - pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); - show_rcu_gp_kthreads(); - } else { - pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); - preempt_disable(); - rdp = this_cpu_ptr(&rcu_data); - rcu_check_gp_start_stall(rdp->mynode, rdp, j); - preempt_enable(); - } - for_each_possible_cpu(cpu) { - cbs = rcu_get_n_cbs_cpu(cpu); - if (!cbs) - continue; - if (max_cpu < 0) - pr_info("%s: callbacks", __func__); - pr_cont(" %d: %lu", cpu, cbs); - if (cbs <= max_cbs) - continue; - max_cbs = cbs; - max_cpu = cpu; - } - if (max_cpu >= 0) - pr_cont("\n"); -} -EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); - -/* - * This does the RCU core processing work for the specified rcu_data - * structures. This may be called only from the CPU to whom the rdp - * belongs. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +/* Perform RCU core processing work for the current CPU. */ +static __latent_entropy void rcu_core(struct softirq_action *unused) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2801,9 +2338,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, /* * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() + * Enforce hysteresis, and don't invoke rcu_force_quiescent_state() * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback + * invoking rcu_force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. 
*/ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > @@ -2820,7 +2357,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rdp->blimit = LONG_MAX; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) - force_quiescent_state(); + rcu_force_quiescent_state(); rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } @@ -2855,7 +2392,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; @@ -2889,9 +2426,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rcu_segcblist_init(&rdp->cblist); } rcu_segcblist_enqueue(&rdp->cblist, head, lazy); - if (!lazy) - rcu_idle_count_callbacks_posted(); - if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, @@ -2961,6 +2495,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(kfree_call_rcu); +/* + * During early boot, any blocking grace-period wait automatically + * implies a grace period. Later on, this is never the case for PREEMPT. + * + * However, because a context switch is a grace period for !PREEMPT, any + * blocking grace-period wait automatically implies a grace period if + * there is only one CPU online at any point in time during execution of + * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds some + * overhead: RCU still operates correctly.
+ */ +static int rcu_blocking_is_gp(void) +{ + int ret; + + if (IS_ENABLED(CONFIG_PREEMPT)) + return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; + might_sleep(); /* Check for RCU read-side critical section. */ + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; +} + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * In addition, regions of code across which interrupts, preemption, or + * softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. 
+ * + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + */ +void synchronize_rcu(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + /** * get_state_synchronize_rcu - Snapshot current RCU state * @@ -3049,28 +2656,6 @@ static int rcu_pending(void) } /* - * Return true if the specified CPU has any callback. If all_lazy is - * non-NULL, store an indication of whether all callbacks are lazy. - * (If there are no callbacks, all of them are deemed to be lazy.) - */ -static bool rcu_cpu_has_callbacks(bool *all_lazy) -{ - bool al = true; - bool hc = false; - struct rcu_data *rdp; - - rdp = this_cpu_ptr(&rcu_data); - if (!rcu_segcblist_empty(&rdp->cblist)) { - hc = true; - if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) - al = false; - } - if (all_lazy) - *all_lazy = al; - return hc; -} - -/* * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. 
*/ @@ -3299,7 +2884,7 @@ int rcutree_prepare_cpu(unsigned int cpu) trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); return 0; } @@ -3329,8 +2914,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->ffmask |= rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); @@ -3355,8 +2938,6 @@ int rcutree_offline_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_offline_cpu(cpu); return 0; } @@ -3500,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedite_gp(); + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); break; default: break; @@ -3683,8 +3262,7 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; - if (jiffies_till_sched_qs == ULONG_MAX) - adjust_jiffies_till_sched_qs(); + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. 
*/ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && @@ -3777,7 +3355,7 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_core); /* * We don't need protection against CPU-hotplug here because @@ -3799,5 +3377,6 @@ void __init rcu_init(void) srcu_init(); } +#include "tree_stall.h" #include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d90b02b53c0e..e253d11af3c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -1,25 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/cache.h> @@ -36,7 +23,6 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { - smp_call_func_t rew_func; unsigned long rew_s; struct work_struct rew_work; }; @@ -194,10 +180,7 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. 
*/ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* Nonlazy_posted snapshot. */ + bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ @@ -234,7 +217,13 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 6) Diagnostic data, including RCU CPU stall warnings. */ + /* 6) RCU priority boosting. */ + struct task_struct *rcu_cpu_kthread_task; + /* rcuc per-CPU kthread or NULL. */ + unsigned int rcu_cpu_kthread_status; + char rcu_cpu_has_work; + + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -303,6 +292,8 @@ struct rcu_state { struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ + unsigned long gp_wake_time; /* Last GP kthread wake. */ + unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ /* End of fields guarded by root rcu_node's lock. 
*/ @@ -402,25 +393,16 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; int rcu_dynticks_snap(struct rcu_data *rdp); -#ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* Forward declarations for rcutree_plugin.h */ +/* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(void); -static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -static void rcu_flavor_check_callbacks(int user); +static void rcu_flavor_sched_clock_irq(int user); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); @@ -431,13 +413,9 @@ static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); -static void rcu_idle_count_callbacks_posted(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(int cpu); -static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head 
*rcu_nocb_gp_get(struct rcu_node *rnp); @@ -451,7 +429,7 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_all_nocb_kthreads(int cpu); +static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); @@ -463,10 +441,9 @@ static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); -#ifdef CONFIG_SRCU -void srcu_online_cpu(unsigned int cpu); -void srcu_offline_cpu(unsigned int cpu); -#else /* #ifdef CONFIG_SRCU */ -void srcu_online_cpu(unsigned int cpu) { } -void srcu_offline_cpu(unsigned int cpu) { } -#endif /* #else #ifdef CONFIG_SRCU */ +/* Forward declarations for tree_stall.h */ +static void record_gp_stall_check_time(void); +static void rcu_iw_handler(struct irq_work *iwp); +static void check_cpu_stall(struct rcu_data *rdp); +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 928fe5893a57..9c990df880d1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -1,27 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU expedited grace periods * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2016 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/lockdep.h> +static void rcu_exp_handler(void *unused); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); + /* * Record the start of an expedited grace period. */ @@ -344,7 +334,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; - smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; @@ -352,7 +341,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); - func = rewp->rew_func; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ @@ -396,7 +384,7 @@ retry_ipi: mask_ofl_test |= mask; continue; } - ret = smp_call_function_single(cpu, func, NULL, 0); + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); if (!ret) { mask_ofl_ipi &= ~mask; continue; @@ -426,7 +414,7 @@ retry_ipi: * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_cpus(smp_call_func_t func) +static void sync_rcu_exp_select_cpus(void) { int cpu; struct rcu_node *rnp; @@ -440,7 +428,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. 
*/ - rnp->rew.rew_func = func; if (!READ_ONCE(rcu_par_gp_wq) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { @@ -449,7 +436,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - preempt_disable(); cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); /* If all offline, queue the work on an unbound CPU. */ if (unlikely(cpu > rnp->grphi - rnp->grplo)) @@ -457,7 +443,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) else cpu += rnp->grplo; queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); - preempt_enable(); rnp->exp_need_flush = true; } @@ -580,10 +565,10 @@ static void rcu_exp_wait_wake(unsigned long s) * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. */ -static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(unsigned long s) { /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(func); + sync_rcu_exp_select_cpus(); /* Wait and clean up, including waking everyone. */ rcu_exp_wait_wake(s); @@ -597,52 +582,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) struct rcu_exp_work *rewp; rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); -} - -/* - * Given a smp_call_function() handler, kick off the specified - * implementation of expedited grace period. - */ -static void _synchronize_rcu_expedited(smp_call_func_t func) -{ - struct rcu_data *rdp; - struct rcu_exp_work rew; - struct rcu_node *rnp; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(); - if (exp_funnel_lock(s)) - return; /* Someone else did our work for us. 
*/ - - /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { - /* Direct call during scheduler init and early_initcalls(). */ - rcu_exp_sel_wait_wake(func, s); - } else { - /* Marshall arguments & schedule the expedited grace period. */ - rew.rew_func = func; - rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); - } - - /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); - rnp = rcu_get_root(); - wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ - - /* Let the next expedited grace period start. */ - mutex_unlock(&rcu_state.exp_mutex); + rcu_exp_sel_wait_wake(rewp->rew_s); } #ifdef CONFIG_PREEMPT_RCU @@ -654,7 +594,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func) * ->expmask fields in the rcu_node tree. Otherwise, immediately * report the quiescent state. */ -static void sync_rcu_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -694,9 +634,10 @@ static void sync_rcu_exp_handler(void *unused) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { rdp->deferred_qs = true; - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; } /* @@ -708,7 +649,7 @@ static void sync_rcu_exp_handler(void *unused) * * If the CPU is fully enabled (or if some buggy RCU-preempt * read-side critical section is being used from idle), just - * invoke rcu_preempt_defer_qs() to immediately report the + * invoke rcu_preempt_deferred_qs() to immediately report the * quiescent state. 
We cannot use rcu_read_unlock_special() * because we are in an interrupt handler, which will cause that * function to take an early exit without doing anything. @@ -730,43 +671,31 @@ static void sync_sched_exp_online_cleanup(int cpu) { } -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - * - * This has the same semantics as (but is more brutal than) synchronize_rcu(). +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. 
*/ -void synchronize_rcu_expedited(void) +static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - _synchronize_rcu_expedited(sync_rcu_exp_handler); + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; } -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #else /* #ifdef CONFIG_PREEMPT_RCU */ /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { struct rcu_data *rdp; struct rcu_node *rnp; @@ -798,44 +727,88 @@ static void sync_sched_exp_online_cleanup(int cpu) rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); WARN_ON_ONCE(ret); } /* - * Because a context switch is a grace period for !PREEMPT, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. 
*/ -static int rcu_blocking_is_gp(void) +static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - int ret; - - might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; + return 0; } -/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU grace period, but expedite it. The basic idea is to + * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether + * the CPU is in an RCU critical section, and if so, it sets a flag that + * causes the outermost rcu_read_unlock() to report the quiescent state + * for RCU-preempt or asks the scheduler for help for RCU-sched. On the + * other hand, if the CPU is not in an RCU read-side critical section, + * the IPI handler reports the quiescent state immediately. + * + * Although this is a great improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then use a single synchronize_rcu() + * instead. + * + * This has the same semantics as (but is more brutal than) synchronize_rcu(). + */ void synchronize_rcu_expedited(void) { + struct rcu_data *rdp; + struct rcu_exp_work rew; + struct rcu_node *rnp; + unsigned long s; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - /* If only one CPU, this is automatically a grace period. */ + /* Is the state such that the call is a grace period?
*/ if (rcu_blocking_is_gp()) return; - _synchronize_rcu_expedited(sync_sched_exp_handler); + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(); + if (exp_funnel_lock(s)) + return; /* Someone else did our work for us. */ + + /* Ensure that load happens before action based on it. */ + if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + /* Direct call during scheduler init and early_initcalls(). */ + rcu_exp_sel_wait_wake(s); + } else { + /* Marshall arguments & schedule the expedited grace period. */ + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew.rew_work); + } + + /* Wait for expedited grace period to complete. */ + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rcu_get_root(); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(s)); + smp_mb(); /* Workqueue actions happen before return. */ + + /* Let the next expedited grace period start. */ + mutex_unlock(&rcu_state.exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1b3dd2fc0cd6..1102765f91fd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1,27 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic * or preemptible semantics. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/delay.h> @@ -34,17 +21,7 @@ #include "../time/tick-internal.h" #ifdef CONFIG_RCU_BOOST - #include "../locking/rtmutex_common.h" - -/* - * Control variables for per-CPU and per-rcu_node kthreads. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - #else /* #ifdef CONFIG_RCU_BOOST */ /* @@ -307,8 +284,8 @@ static void rcu_qs(void) __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ - current->rcu_read_unlock_special.b.need_qs = false; + barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); } } @@ -666,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. 
- */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ - unsigned long flags; - struct task_struct *t; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - /* - * We could be printing a lot while holding a spinlock. - * Avoid triggering hard lockup. - */ - touch_nmi_watchdog(); - sched_show_task(t); - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(void) -{ - struct rcu_node *rnp = rcu_get_root(); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return 0; - rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - rcu_print_task_stall_end(); - return ndetected; -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each that is blocking the current - * expedited grace period. 
- */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rnp->exp_tasks) - return 0; - t = list_entry(rnp->exp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - return ndetected; -} - -/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be @@ -788,13 +671,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the corresponding CPU's rcu_node structure, - * which is checked elsewhere. - * - * Caller must disable hard irqs. + * Check for a quiescent state from the current CPU, including voluntary + * context switches for Tasks RCU. When a task blocks, the task is + * recorded in the corresponding CPU's rcu_node structure, which is checked + * elsewhere, hence this function need only check for quiescent states + * related to the current CPU, not to those related to tasks. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; @@ -825,69 +708,27 @@ static void rcu_flavor_check_callbacks(int user) t->rcu_read_unlock_special.b.need_qs = true; } -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. Note, however, that - * upon return from synchronize_rcu(), the caller might well be executing - * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. 
RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last RCU read-side critical section whose beginning - * preceded the call to synchronize_rcu(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_rcu() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_rcu() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. 
No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so. No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort. Spewing warnings from this + * function is like as not to simply obscure important prior warnings. */ void exit_rcu(void) { struct task_struct *t = current; - if (likely(list_empty(&current->rcu_node_entry))) + if (unlikely(!list_empty(&current->rcu_node_entry))) { + t->rcu_read_lock_nesting = 1; + barrier(); + WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); + } else if (unlikely(t->rcu_read_lock_nesting)) { + t->rcu_read_lock_nesting = 1; + } else { return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special.b.blocked = true; + } __rcu_read_unlock(); rcu_preempt_deferred_qs(current); } @@ -1051,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { } /* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(void) -{ -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - return 0; -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections that are - * blocking the current expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - return 0; -} - -/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. 
@@ -1088,14 +902,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * Check to see if this CPU is in a non-context-switch quiescent state, + * namely user mode and idle loop. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -1115,22 +925,6 @@ static void rcu_flavor_check_callbacks(int user) } } -/* PREEMPT=n implementation of synchronize_rcu(). */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Because preemptible RCU does not exist, tasks cannot possibly exit * while in preemptible RCU read-side critical sections. 
@@ -1276,8 +1070,6 @@ static int rcu_boost_kthread(void *arg) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - struct task_struct *t; - raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1291,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - t = rnp->boost_kthread_task; - if (t) - rcu_wake_cond(t, rnp->boost_kthread_status); + rcu_wake_cond(rnp->boost_kthread_task, + rnp->boost_kthread_status); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1307,11 +1098,11 @@ static void invoke_rcu_callbacks_kthread(void) unsigned long flags; local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), - __this_cpu_read(rcu_cpu_kthread_status)); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), + __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); } local_irq_restore(flags); } @@ -1322,7 +1113,7 @@ static void invoke_rcu_callbacks_kthread(void) */ static bool rcu_is_callbacks_kthread(void) { - return __this_cpu_read(rcu_cpu_kthread_task) == current; + return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1369,11 +1160,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(this_cpu_ptr(&rcu_data)); -} - static void 
rcu_cpu_kthread_setup(unsigned int cpu) { struct sched_param sp; @@ -1384,12 +1170,12 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) static void rcu_cpu_kthread_park(unsigned int cpu) { - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; } static int rcu_cpu_kthread_should_run(unsigned int cpu) { - return __this_cpu_read(rcu_cpu_has_work); + return __this_cpu_read(rcu_data.rcu_cpu_has_work); } /* @@ -1399,21 +1185,20 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) */ static void rcu_cpu_kthread(unsigned int cpu) { - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); local_irq_disable(); work = *workp; *workp = 0; local_irq_enable(); if (work) - rcu_kthread_do_work(); + rcu_do_batch(this_cpu_ptr(&rcu_data)); local_bh_enable(); if (*workp == 0) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); @@ -1459,7 +1244,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) } static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, + .store = &rcu_data.rcu_cpu_kthread_task, .thread_should_run = rcu_cpu_kthread_should_run, .thread_fn = rcu_cpu_kthread, .thread_comm = "rcuc/%u", @@ -1476,7 +1261,7 @@ static void __init rcu_spawn_boost_kthreads(void) int cpu; for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", 
__func__)) return; rcu_for_each_leaf_node(rnp) @@ -1543,7 +1328,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return rcu_cpu_has_callbacks(NULL); + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); } /* @@ -1562,14 +1347,6 @@ static void rcu_prepare_for_idle(void) { } -/* - * Don't bother keeping a running count of the number of RCU callbacks - * posted because CONFIG_RCU_FAST_NO_HZ=n. - */ -static void rcu_idle_count_callbacks_posted(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1652,11 +1429,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) lockdep_assert_irqs_disabled(); - /* Snapshot to detect later posting of non-lazy callback. */ - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { + if (rcu_segcblist_empty(&rdp->cblist)) { *nextevt = KTIME_MAX; return 0; } @@ -1670,11 +1444,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) rdp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (!rdp->all_lazy) { + rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); + if (rdp->all_lazy) { + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + } else { dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - } else { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } *nextevt = basemono + dj * TICK_NSEC; return 0; @@ -1704,7 +1479,7 @@ static void rcu_prepare_for_idle(void) /* Handle nohz enablement switches conservatively. */ tne = READ_ONCE(tick_nohz_active); if (tne != rdp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(NULL)) + if (!rcu_segcblist_empty(&rdp->cblist)) invoke_rcu_core(); /* force nohz to see update. 
*/ rdp->tick_nohz_enabled_snap = tne; return; @@ -1717,10 +1492,8 @@ static void rcu_prepare_for_idle(void) * callbacks, invoke RCU core for the side-effect of recalculating * idle duration on re-entry to idle. */ - if (rdp->all_lazy && - rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { + if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { rdp->all_lazy = false; - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1756,142 +1529,49 @@ static void rcu_cleanup_after_idle(void) invoke_rcu_core(); } -/* - * Keep a running count of the number of non-lazy callbacks posted - * on this CPU. This running counter (which is never decremented) allows - * rcu_prepare_for_idle() to detect when something out of the idle loop - * posts a callback, even if an equal number of callbacks are invoked. - * Of course, callbacks should only be posted from within a trace event - * designed to be called from idle or from within RCU_NONIDLE(). - */ -static void rcu_idle_count_callbacks_posted(void) -{ - __this_cpu_add(rcu_data.nonlazy_posted, 1); -} - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; - - sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ulong2long(nlpd), - rdp->all_lazy ? 'L' : '.', - rdp->tick_nohz_enabled_snap ? '.' : 'D'); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ - pr_cont("\n"); -} - -/* - * Print out diagnostic information for the specified stalled CPU. 
- * - * If the specified CPU is aware of the current RCU grace period, then - * print the number of scheduling clock interrupts the CPU has taken - * during the time that it has been aware. Otherwise, print the number - * of RCU grace periods that this CPU is ignorant of, for example, "1" - * if the CPU was aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(int cpu) -{ - unsigned long delta; - char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - char *ticks_title; - unsigned long ticks_value; - - /* - * We could be printing a lot while holding a spinlock. Avoid - * triggering hard lockup. - */ - touch_nmi_watchdog(); - - ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); - if (ticks_value) { - ticks_title = "GPs behind"; - } else { - ticks_title = "ticks this GP"; - ticks_value = rdp->ticks_this_gp; - } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", - cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], - "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], - !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : - rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : - "!."[!delta], - ticks_value, ticks_title, - rcu_dynticks_snap(rdp) & 0xfff, - rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, - rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); -} - -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ - pr_err("\t"); -} - -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. 
*/ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); - WRITE_ONCE(rdp->last_fqs_resched, jiffies); -} - #ifdef CONFIG_RCU_NOCB_CPU /* * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. For each CPU in the set, there is a - * kthread created that pulls the callbacks from the corresponding CPU, - * waits for a grace period to elapse, and invokes the callbacks. - * The no-CBs CPUs do a wake_up() on their kthread when they insert - * a callback into any empty list, unless the rcu_nocb_poll boot parameter - * has been specified, in which case each kthread actively polls its - * CPU. (Which isn't so great for energy efficiency, but which does - * reduce RCU's overhead on that CPU.) + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. These kthreads + * are organized into leaders, which manage incoming callbacks, wait for + * grace periods, and awaken followers, and the followers, which only + * invoke callbacks. Each leader is its own follower. The no-CBs CPUs + * do a wake_up() on their kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) * * This is intended to be used in conjunction with Frederic Weisbecker's * adaptive-idle work, which would seriously reduce OS jitter on CPUs * running CPU-bound user-mode computations. * - * Offloading of callback processing could also in theory be used as - * an energy-efficiency measure because CPUs with no RCU callbacks - * queued are more aggressive about entering dyntick-idle mode. 
+ * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. */ -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a + * comma-separated list of CPUs and/or CPU ranges. If an invalid list is + * given, a warning is emitted and all CPUs are offloaded. + */ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - cpulist_parse(str, rcu_nocb_mask); + if (!strcasecmp(str, "all")) + cpumask_setall(rcu_nocb_mask); + else + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); @@ -1987,10 +1667,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } -/* - * Does the specified CPU need an RCU callback for this invocation - * of rcu_barrier()? - */ +/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ static bool rcu_nocb_cpu_needs_barrier(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2006,8 +1683,8 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) * callbacks would be posted. In the worst case, the first * barrier in rcu_barrier() suffices (but the caller cannot * necessarily rely on this, not a substitute for the caller - * getting the concurrency design right!). There must also be - * a barrier between the following load an posting of a callback + * getting the concurrency design right!). There must also be a + * barrier between the following load and posting of a callback * (if a callback is in fact needed). This is associated with an * atomic_inc() in the caller. 
*/ @@ -2517,9 +2194,9 @@ static void rcu_spawn_one_nocb_kthread(int cpu) /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthreads, spawn them. + * rcuo kthread, spawn it. */ -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { if (rcu_scheduler_fully_active) rcu_spawn_one_nocb_kthread(cpu); @@ -2536,7 +2213,7 @@ static void __init rcu_spawn_nocb_kthreads(void) int cpu; for_each_online_cpu(cpu) - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); } /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ @@ -2670,7 +2347,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { } -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..f65a73a97323 --- /dev/null +++ b/kernel/rcu/tree_stall.h @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RCU CPU stall warnings for normal RCU grace periods + * + * Copyright IBM Corporation, 2019 + * + * Author: Paul E. McKenney <paulmck@linux.ibm.com> + */ + +////////////////////////////////////////////////////////////////////////////// +// +// Controlling CPU stall warnings, including delay calculation. + +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +/* Limit-check stall timeouts specified at boottime and runtime. */ +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+ */ + if (till_stall_check < 3) { + WRITE_ONCE(rcu_cpu_stall_timeout, 3); + till_stall_check = 3; + } else if (till_stall_check > 300) { + WRITE_ONCE(rcu_cpu_stall_timeout, 300); + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); + +/* Don't do RCU CPU stall warnings during long sysrq printouts. */ +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + +/* Don't print RCU CPU stall warnings during a kernel panic. */ +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ +static void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Interaction with RCU grace periods + +/* Start of new grace period, so record stall time (and forcing times). 
*/ +static void record_gp_stall_check_time(void) +{ + unsigned long j = jiffies; + unsigned long j1; + + rcu_state.gp_start = j; + j1 = rcu_jiffies_till_stall_check(); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); + } +} + +/* + * Handler for the irq_work request posted about halfway into the RCU CPU + * stall timeout, and used to detect excessive irq disabling. Set state + * appropriately, but just complain if there is unexpected state on entry. 
+ */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gp_seq = rnp->gp_seq; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Printing RCU CPU stall warnings + +#ifdef CONFIG_PREEMPT + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. 
+ */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return 0; + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + pr_cont("\n"); + return ndetected; +} + +#else /* #ifdef CONFIG_PREEMPT */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + return 0; +} +#endif /* #else #ifdef CONFIG_PREEMPT */ + +/* + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. 
+ */ +static void rcu_dump_cpu_stacks(void) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + + sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + rdp->last_accelerate & 0xffff, jiffies & 0xffff, + ".l"[rdp->all_lazy], + ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], + ".D"[!!rdp->tick_nohz_enabled_snap]); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + *cp = '\0'; +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware. Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(int cpu) +{ + unsigned long delta; + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + char *ticks_title; + unsigned long ticks_value; + + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. 
+ */ + touch_nmi_watchdog(); + + ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : + "!."[!delta], + ticks_value, ticks_title, + rcu_dynticks_snap(rdp) & 0xfff, + rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz); +} + +/* Complain about starvation of grace-period kthread. */ +static void rcu_check_gp_kthread_starvation(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + unsigned long j; + + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + READ_ONCE(rcu_state.gp_flags), + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + if (gpk) { + pr_err("RCU grace-period kthread stack dump:\n"); + sched_show_task(gpk); + wake_up_process(gpk); + } + } +} + +static void print_other_cpu_stall(unsigned long gp_seq) +{ + int cpu; + unsigned long flags; + unsigned long gpa; + unsigned long j; + int ndetected = 0; + struct rcu_node *rnp; + long totqlen = 0; + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on our buddy... 
+ * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(cpu); + ndetected++; + } + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + if (ndetected) { + rcu_dump_cpu_stacks(); + + /* Complain about tasks blocking the grace period. */ + rcu_for_each_leaf_node(rnp) + rcu_print_detail_task_stall_rnp(rnp); + } else { + if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = READ_ONCE(rcu_state.gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", + rcu_state.name, j - gpa, j, gpa, + READ_ONCE(jiffies_till_next_fqs), + rcu_get_root()->qsmask); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + + rcu_check_gp_kthread_starvation(); + + panic_on_rcu_stall(); + + rcu_force_quiescent_state(); /* Kick them all. */ +} + +static void print_cpu_stall(void) +{ + int cpu; + unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rcu_get_root(); + long totqlen = 0; + + /* Kick and suppress, if so configured. 
*/ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); + print_cpu_stall_info(smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + jiffies - rcu_state.gp_start, + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + + rcu_check_gp_kthread_starvation(); + + rcu_dump_cpu_stacks(); + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + panic_on_rcu_stall(); + + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. + */ + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + +static void check_cpu_stall(struct rcu_data *rdp) +{ + unsigned long gs1; + unsigned long gs2; + unsigned long gps; + unsigned long j; + unsigned long jn; + unsigned long js; + struct rcu_node *rnp; + + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress()) + return; + rcu_stall_kick_kthreads(); + j = jiffies; + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rcu_state.gp_seq, then + * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally + * another copy of rcu_state.gp_seq. 
These values are updated in + * the opposite order with memory barriers (or equivalent) during + * grace-period initialization and cleanup. Now, a false positive + * can occur if we get an new value of rcu_state.gp_start and a old + * value of rcu_state.jiffies_stall. But given the memory barriers, + * the only way that this can happen is if one grace period ends + * and another starts between these two fetches. This is detected + * by comparing the second fetch of rcu_state.gp_seq with the + * previous fetch from rcu_state.gp_seq. + * + * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, + * and rcu_state.gp_start suffice to forestall false positives. + */ + gs1 = READ_ONCE(rcu_state.gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ + js = READ_ONCE(rcu_state.jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = READ_ONCE(rcu_state.gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rcu_state.gp_seq); + if (gs1 != gs2 || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ + rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + if (rcu_gp_in_progress() && + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(); + + } else if (rcu_gp_in_progress() && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2); + } +} + +////////////////////////////////////////////////////////////////////////////// +// +// RCU forward-progress mechanisms, including of callback invocation. + + +/* + * Show the state of the grace-period kthreads. 
+ */ +void show_rcu_gp_kthreads(void) +{ + int cpu; + unsigned long j; + unsigned long ja; + unsigned long jr; + unsigned long jw; + struct rcu_data *rdp; + struct rcu_node *rnp; + + j = jiffies; + ja = j - READ_ONCE(rcu_state.gp_activity); + jr = j - READ_ONCE(rcu_state.gp_req_activity); + jw = j - READ_ONCE(rcu_state.gp_wake_time); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", + rcu_state.name, gp_state_getname(rcu_state.gp_state), + rcu_state.gp_state, + rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, + ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), + (long)READ_ONCE(rcu_state.gp_seq), + (long)READ_ONCE(rcu_get_root()->gp_seq_needed), + READ_ONCE(rcu_state.gp_flags)); + rcu_for_each_node_breadth_first(rnp) { + if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", + rnp->grplo, rnp->grphi, (long)rnp->gp_seq, + (long)rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rcu_state.gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %ld\n", + cpu, (long)rdp->gp_seq_needed); + } + } + /* sched_show_task(rcu_state.gp_kthread); */ +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. 
+ */ +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, rcu_state.gp_req_activity + gpssdelay) || + time_before(j, rcu_state.gp_activity + gpssdelay) || + atomic_xchg(&warned, 1)) { + raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + show_rcu_gp_kthreads(); +} + +/* + * Do a forward-progress check for rcutorture. This is normally invoked + * due to an OOM event. The argument "j" gives the time period during + * which rcutorture would like progress to have been made. 
+ */ +void rcu_fwd_progress_check(unsigned long j) +{ + unsigned long cbs; + int cpu; + unsigned long max_cbs = 0; + int max_cpu = -1; + struct rcu_data *rdp; + + if (rcu_gp_in_progress()) { + pr_info("%s: GP age %lu jiffies\n", + __func__, jiffies - rcu_state.gp_start); + show_rcu_gp_kthreads(); + } else { + pr_info("%s: Last GP end %lu jiffies ago\n", + __func__, jiffies - rcu_state.gp_end); + preempt_disable(); + rdp = this_cpu_ptr(&rcu_data); + rcu_check_gp_start_stall(rdp->mynode, rdp, j); + preempt_enable(); + } + for_each_possible_cpu(cpu) { + cbs = rcu_get_n_cbs_cpu(cpu); + if (!cbs) + continue; + if (max_cpu < 0) + pr_info("%s: callbacks", __func__); + pr_cont(" %d: %lu", cpu, cbs); + if (cbs <= max_cbs) + continue; + max_cbs = cbs; + max_cpu = cpu; + } + if (max_cpu >= 0) + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); + +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ + show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { + .handler = sysrq_show_rcu, + .help_msg = "show-rcu(y)", + .action_msg = "Show RCU tree", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ + if (sysrq_rcu) + return register_sysrq_key('y', &sysrq_rcudump_op); + return 0; +} +early_initcall(rcu_sysrq_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1971869c4072..c3bf44ba42e5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -1,26 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2001 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -52,6 +39,7 @@ #include <linux/tick.h> #include <linux/rcupdate_wait.h> #include <linux/sched/isolation.h> +#include <linux/kprobes.h> #define CREATE_TRACE_POINTS @@ -249,6 +237,7 @@ int notrace debug_lockdep_rcu_enabled(void) current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? @@ -435,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); #endif #ifdef CONFIG_RCU_STALL_COMMON - -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif - int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. 
*/ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - module_param(rcu_cpu_stall_suppress, int, 0644); +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ - int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - WRITE_ONCE(rcu_cpu_stall_timeout, 3); - till_stall_check = 3; - } else if (till_stall_check > 300) { - WRITE_ONCE(rcu_cpu_stall_timeout, 300); - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); - -void rcu_sysrq_start(void) -{ - if (!rcu_cpu_stall_suppress) - rcu_cpu_stall_suppress = 2; -} - -void rcu_sysrq_end(void) -{ - if (rcu_cpu_stall_suppress == 2) - rcu_cpu_stall_suppress = 0; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); - return 0; -} -early_initcall(check_cpu_stall_init); - #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_TASKS_RCU diff --git a/kernel/relay.c b/kernel/relay.c index 9e0f52375487..ade14fb7ce2e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,7 +1177,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, diff --git a/kernel/resource.c b/kernel/resource.c index 915c02e8e5dd..8c15f846e8ef 100644 --- 
a/kernel/resource.c +++ b/kernel/resource.c @@ -382,7 +382,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, int (*func)(struct resource *, void *)) { struct resource res; - int ret = -1; + int ret = -EINVAL; while (start < end && !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { @@ -448,12 +448,13 @@ int walk_mem_res(u64 start, u64 end, void *arg, arg, func); } -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) - /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. + * + * This will find System RAM ranges that are children of top-level resources + * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -462,14 +463,14 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long flags; struct resource res; unsigned long pfn, end_pfn; - int ret = -1; + int ret = -EINVAL; start = (u64) start_pfn << PAGE_SHIFT; end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - true, &res)) { + false, &res)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -481,8 +482,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, return ret; } -#endif - static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; @@ -521,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - resource_size_t end = start + size - 1; + struct resource res; int type = 0; int other = 0; struct resource *p; + res.start = start; + res.end = 
start + size - 1; + read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); - if (start >= p->start && start <= p->end) - is_type ? type++ : other++; - if (end >= p->start && end <= p->end) - is_type ? type++ : other++; - if (p->start >= start && p->end <= end) + if (resource_overlaps(p, &res)) is_type ? type++ : other++; } read_unlock(&resource_lock); @@ -1132,6 +1130,15 @@ struct resource * __request_region(struct resource *parent, conflict = __request_resource(parent, res); if (!conflict) break; + /* + * mm/hmm.c reserves physical addresses which then + * become unavailable to other users. Conflicts are + * not expected. Warn to aid debugging if encountered. + */ + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_warn("Unaddressable device %s %pR conflicts with %pR", + conflict->name, conflict, res); + } if (conflict != parent) { if (!(conflict->flags & IORESOURCE_BUSY)) { parent = conflict; diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..9424ee90589e 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs) * - signal delivery, * and return to user-space. * - * This is how we can ensure that the entire rseq critical section, - * consisting of both the C part and the assembly instruction sequence, + * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. @@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. 
*/ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (current->rseq_len != rseq_len) + if (rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (ret) return ret; current->rseq = NULL; - current->rseq_len = 0; current->rseq_sig = 0; return 0; } @@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || current->rseq_len != rseq_len) + if (current->rseq != rseq || rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; - current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8d76a65cfdd..102dfcf0a29a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old CPU in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock(), the acquire of * the old rq->lock will fully serialize against the stores. * - * If we observe the new CPU in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. + * If we observe the new CPU in task_rq_lock(), the address + * dependency headed by '[L] rq = task_rq()' and the acquire + * will pair with the WMB to ensure we then also see migrating. 
*/ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); @@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif + update_rq_clock_pelt(rq, delta); } void update_rq_clock(struct rq *rq) @@ -396,19 +398,7 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif -/** - * wake_q_add() - queue a wakeup for 'later' waking. - * @head: the wake_q_head to add @task to - * @task: the task to queue for 'later' wakeup - * - * Queue a task for later wakeup, most likely by the wake_up_q() call in the - * same context, _HOWEVER_ this is not guaranteed, the wakeup can come - * instantly. - * - * This function must be used as-if it were wake_up_process(); IOW the task - * must be ready to be woken at this location. - */ -void wake_q_add(struct wake_q_head *head, struct task_struct *task) +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; @@ -421,16 +411,56 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * state, even in the failed case, an explicit smp_mb() must be used. */ smp_mb__before_atomic(); - if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) - return; - - get_task_struct(task); + if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) + return false; /* * The head is context local, there can be no concurrency. */ *head->lastp = node; head->lastp = &node->next; + return true; +} + +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. 
+ * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + if (__wake_q_add(head, task)) + get_task_struct(task); +} + +/** + * wake_q_add_safe() - safely queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + * + * This function is essentially a task-safe equivalent to wake_q_add(). Callers + * that already hold reference to @task can call the 'safe' version and trust + * wake_q to do the right thing depending whether or not the @task is already + * queued for wakeup. + */ +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) +{ + if (!__wake_q_add(head, task)) + put_task_struct(task); } void wake_up_q(struct wake_q_head *head) @@ -762,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); + + p->on_rq = TASK_ON_RQ_QUEUED; } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { + p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -890,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). 
*/ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) @@ -928,7 +962,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -1121,7 +1155,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); return 0; } else if (task_on_rq_queued(p)) { /* @@ -1207,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -1651,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) __schedstat_inc(p->se.statistics.nr_wakeups_sync); } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; - - /* If a worker is waking up, notify the workqueue: */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); -} - /* * Mark the task runnable and perform wakeup-preemption. */ @@ -1712,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, en_flags); + activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); } @@ -2077,56 +2098,6 @@ out: } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning - * - * Put @p on the run-queue if it's not already there. 
The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet picked a replacement task. - */ - rq_unlock(rq, rf); - raw_spin_lock(&p->pi_lock); - rq_relock(rq, rf); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - trace_sched_waking(p); - - if (!task_on_rq_queued(p)) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&rq->nr_iowait); - } - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); - } - - ttwu_do_wakeup(rq, p, 0, rf); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. 
* @@ -2190,6 +2161,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } @@ -2431,10 +2405,9 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); - post_init_entity_util_avg(&p->se); + post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -3433,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt) prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - prev->on_rq = 0; if (prev->in_iowait) { atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) - try_to_wake_up_local(to_wakeup, &rf); - } } switch_count = &prev->nvcsw; } @@ -3511,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + * As this function is called inside the schedule() context, + * we disable preemption to avoid it calling schedule() again + * in the possible wakeup of a kworker. + */ + if (tsk->flags & PF_WQ_WORKER) { + preempt_disable(); + wq_worker_sleeping(tsk); + preempt_enable_no_resched(); + } + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. 
@@ -3519,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -3529,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void) __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -5265,9 +5245,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct old_timespec32 __user *, interval) +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5867,14 +5846,11 @@ void __init sched_init_smp(void) /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot - * happen. The hotplug lock is nevertheless taken to satisfy lockdep, - * but there won't be any contention on it. + * happen. 
*/ - cpus_read_lock(); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); - cpus_read_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) @@ -5889,7 +5865,7 @@ void __init sched_init_smp(void) static int __init migration_init(void) { - sched_rq_cpu_starting(smp_processor_id()); + sched_cpu_starting(smp_processor_id()); return 0; } early_initcall(migration_init); @@ -6162,6 +6138,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); + +void __cant_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > preempt_offset) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); + printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -6502,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load(shareval)); } @@ -6517,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 
min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -6597,20 +6603,22 @@ out_unlock: return ret; } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; - else + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; @@ -6623,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; + if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; @@ -6941,7 +6952,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, { char tok[21]; /* U64_MAX */ - if (!sscanf(buf, "%s %llu", tok, periodp)) + if (sscanf(buf, "%20s %llu", tok, periodp) < 1) return -EINVAL; *periodp *= NSEC_PER_USEC; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 22bd8980f32f..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -7,7 +7,7 @@ */ #include "sched.h" -DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. 
@@ -48,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); * * Clear the update_util_data pointer for the given CPU. * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() + * Callers must use RCU callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_rcu() * right after this function to avoid use-after-free. */ void cpufreq_remove_update_util_hook(int cpu) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..962cf343f798 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@ #include <linux/sched/cpufreq.h> #include <trace/events/power.h> +#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -48,7 +50,6 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; - unsigned int iowait_boost_max; u64 last_update; unsigned long bw_dl; @@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * * The IO wait boost of a task is disabled after a tick since the last update * of a CPU. If a new IO wait boost is requested after more then a tick, then - * we enable the boost starting from the minimum frequency, which improves - * energy efficiency by ignoring sporadic wakeups from IO. + * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + * efficiency by ignoring sporadic wakeups from IO. */ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, bool set_iowait_boost) @@ -303,8 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost - ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? 
IOWAIT_BOOST_MIN : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -318,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, * * Each time a task wakes up after an IO operation, the CPU utilization can be * boosted to a certain utilization which doubles at each "frequent and - * successive" wakeup from IO, ranging from the utilization of the minimum - * OPP to the utilization of the maximum OPP. + * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + * of the maximum OPP. + * * To keep doubling, an IO boost has to be requested at least once per tick, * otherwise we restart from the utilization of the minimum OPP. */ @@ -344,14 +345,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, /* Double the boost at each request */ if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + sg_cpu->iowait_boost = + min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); return; } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; } /** @@ -373,47 +373,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. 
*/ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long util, unsigned long max) { - unsigned int boost_util, boost_max; + unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return util; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return util; - /* - * An IO waiting task has just woken up: - * allow to further double the boost value - */ - if (sg_cpu->iowait_boost_pending) { - sg_cpu->iowait_boost_pending = false; - } else { + if (!sg_cpu->iowait_boost_pending) { /* - * Otherwise: reduce the boost value and disable it when we - * reach the minimum. + * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return util; } } + sg_cpu->iowait_boost_pending = false; + /* - * Apply the current boost value: a CPU is boosted only if its current - * utilization is smaller then the current IO boost level. + * @util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. 
*/ - boost_util = sg_cpu->iowait_boost; - boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { - *util = boost_util; - *max = boost_max; - } + boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; + return max(boost, util); } #ifdef CONFIG_NO_HZ_COMMON @@ -460,7 +451,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, util = sugov_get_util(sg_cpu); max = sg_cpu->max; - sugov_iowait_apply(sg_cpu, time, &util, &max); + util = sugov_iowait_apply(sg_cpu, time, util, max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle @@ -500,7 +491,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) j_util = sugov_get_util(j_sg_cpu); j_max = j_sg_cpu->max; - sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); if (j_util * max > j_max * util) { util = j_util; @@ -609,13 +600,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, }; @@ -782,6 +774,7 @@ out: return 0; fail: + kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; sugov_tunables_free(tunables); @@ -837,7 +830,6 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { @@ -859,7 +851,7 @@ static void sugov_stop(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) cpufreq_remove_update_util_hook(cpu); - 
synchronize_sched(); + synchronize_rcu(); if (!policy->fast_switch_enabled) { irq_work_sync(&sg_policy->irq_work); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb8b7b5d745d..43901fa3f269 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p) if (dl_entity_is_special(dl_se)) return; - WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p) * If the "0-lag time" already passed, decrease the active * utilization now, instead of starting a timer */ - if (zerolag_time < 0) { + if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { @@ -1767,7 +1766,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1776,7 +1775,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1793,7 +1792,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. 
In that case NEED_RESCHED will diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index de3de997e245..678bfb9bd87f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void) { static struct ctl_table *cpu_entries; static struct ctl_table **cpu_idx; + static bool init_done = false; char buf[32]; int i; @@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void) if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; + } + if (!init_done) { + init_done = true; /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } @@ -698,7 +702,7 @@ do { \ static const char *sched_tunable_scaling_names[] = { "none", - "logaritmic", + "logarithmic", "linear" }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 310d0637fe4b..f35930f5e528 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -248,13 +248,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { - if (!cfs_rq->on_list) { - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cfs_rq->on_list) + return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; + + cfs_rq->on_list = 1; + + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. 
The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the top of the tree + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { /* - * Ensure we either appear before our parent (if already - * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases and a special case for the root - * cfs_rq. Furthermore, it also means that we will always reset - * tmp_alone_branch either when the branch is connected - * to a tree or when we reach the beg of the tree + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. */ - if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { - /* - * If parent is already on the list, we add the child - * just before. Thanks to circular linked property of - * the list, this means to put the child at the tail - * of the list that starts by parent. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); - /* - * The branch is now connected to its tree so we can - * reset tmp_alone_branch to the beginning of the - * list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else if (!cfs_rq->tg->parent) { - /* - * cfs rq without parent should be put - * at the tail of the list. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - /* - * We have reach the beg of a tree so we can reset - * tmp_alone_branch to the beginning of the list. 
- */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else { - /* - * The parent has not already been added so we want to - * make sure that it will be put after us. - * tmp_alone_branch points to the beg of the branch - * where we will add parent. - */ - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - rq->tmp_alone_branch); - /* - * update tmp_alone_branch to points to the new beg - * of the branch - */ - rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; - } + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; + } - cfs_rq->on_list = 1; + if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reach the top of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; } + + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the begin of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new begin + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; + return false; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + + /* + * With cfs_rq being unthrottled/throttled during an enqueue, + * it can happen the tmp_alone_branch points the a leaf that + * we finally want to del. In this case, tmp_alone_branch moves + * to the prev element but it will point to rq->leaf_cfs_rq_list + * at the end of the enqueue. 
+ */ + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) + rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); cfs_rq->on_list = 0; } } -/* Iterate through all leaf cfs_rq's on a runqueue: */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ + SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - - #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { + return true; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ +} + +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -#ifdef CONFIG_SMP #include "pelt.h" -#include "sched-pelt.h" +#ifdef CONFIG_SMP static int 
select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap. */ -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); @@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - if (entity_is_task(se)) { - struct task_struct *p = task_of(se); - if (p->sched_class != &fair_sched_class) { - /* - * For !fair tasks do: - * - update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); - switched_from_fair(rq, p); - * - * such that the next switched_to_fair() has the - * expected state. - */ - se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); - return; - } + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq); + attach_entity_load_avg(cfs_rq, se, 0); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. 
+ */ + se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); + return; } attach_entity_cfs_rq(se); @@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se) void init_entity_runnable_average(struct sched_entity *se) { } -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { } static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) @@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256; unsigned int sysctl_numa_balancing_scan_delay = 1000; struct numa_group { - atomic_t refcount; + refcount_t refcount; spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; @@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p) unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; } @@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p) unsigned long private = group_faults_priv(ng); unsigned long period = smax; - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; @@ -1160,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1180,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= 
(p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1400,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. */ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1848,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1994,6 +2007,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; @@ -2095,7 +2112,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2203,12 +2220,12 @@ static void task_numa_placement(struct task_struct *p) static inline int get_numa_group(struct numa_group *grp) { - return atomic_inc_not_zero(&grp->refcount); + return refcount_inc_not_zero(&grp->refcount); } static inline void put_numa_group(struct numa_group *grp) { - if (atomic_dec_and_test(&grp->refcount)) + if 
(refcount_dec_and_test(&grp->refcount)) kfree_rcu(grp, rcu); } @@ -2229,7 +2246,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) return; - atomic_set(&grp->refcount, 1); + refcount_set(&grp->refcount, 1); grp->active_nodes = 1; grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); @@ -2580,7 +2597,7 @@ out: /* * Drive the periodic memory faults.. */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr) { struct callback_head *work = &curr->numa_work; u64 period, now; @@ -2638,7 +2655,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } @@ -3122,7 +3140,7 @@ void set_task_rq_fair(struct sched_entity *se, p_last_update_time = prev->avg.last_update_time; n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } @@ -3257,11 +3275,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* * runnable_sum can't be lower than running_sum - * As running sum is scale with CPU capacity wehreas the runnable sum - * is not we rescale running_sum 1st + * Rescale running sum to be in the same range as runnable sum + * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] + * runnable_sum is in [0 : LOAD_AVG_MAX] */ - running_sum = se->avg.util_sum / - arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; @@ -3364,7 +3382,7 @@ static inline void 
add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages - * @now: current time, as per cfs_rq_clock_task() + * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) @@ -3409,7 +3427,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) decayed = 1; } - decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3499,9 +3517,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + u64 now = cfs_rq_clock_pelt(cfs_rq); int decayed; /* @@ -3509,7 +3525,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * track group sched_entity load average for task_h_load calc in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) - __update_load_avg_se(now, cpu, cfs_rq, se); + __update_load_avg_se(now, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); @@ -3555,20 +3571,20 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) * Synchronize entity load avg of dequeued entity without locking * the previous rq. 
*/ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); + __update_load_avg_blocked_se(last_update_time, se); } /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; @@ -3577,10 +3593,6 @@ void remove_entity_load_avg(struct sched_entity *se) * tasks cannot exit without having gone through wake_up_new_task() -> * post_init_entity_util_avg() which will have added things to the * cfs_rq, so we can remove unconditionally. - * - * Similarly for groups, they will have passed through - * post_init_entity_util_avg() before unregister_sched_fair_group() - * calls this. */ sync_entity_load_avg(se); @@ -3654,6 +3666,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { long last_ewma_diff; struct util_est ue; + int cpu; if (!sched_feat(UTIL_EST)) return; @@ -3688,6 +3701,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * To avoid overestimation of actual task utilization, skip updates if + * we cannot grant there is idle time in this CPU. 
+ */ + cpu = cpu_of(rq_of(cfs_rq)); + if (task_util(p) > capacity_orig_of(cpu)) + return; + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample @@ -4429,6 +4450,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; + + /* Add cfs_rq with already running entity in the list */ + if (cfs_rq->nr_running >= 1) + list_add_leaf_cfs_rq(cfs_rq); } return 0; @@ -4440,8 +4465,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) + if (!cfs_rq->throttle_count) { cfs_rq->throttled_clock_task = rq_clock_task(rq); + list_del_leaf_cfs_rq(cfs_rq); + } cfs_rq->throttle_count++; return 0; @@ -4544,6 +4571,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) break; } + assert_list_leaf_cfs_rq(rq); + if (!se) add_nr_running(rq, task_delta); @@ -4565,7 +4594,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4582,7 +4611,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); if (!remaining) break; @@ -4598,7 +4627,7 @@ next: * period the timer is deactivated until scheduling resumes; cfs_b->idle is * used to track this state. 
*/ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { u64 runtime, runtime_expires; int throttled; @@ -4640,11 +4669,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); @@ -4753,17 +4782,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); if (cfs_b->distribute_running) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } @@ -4774,18 +4804,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (runtime) cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) return; runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); if (expires == cfs_b->runtime_expires) lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; - 
raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* @@ -4859,24 +4889,50 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } +extern const u64 max_cfs_quota_period; + static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); + unsigned long flags; int overrun; int idle = 0; + int count = 0; - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; - idle = do_sched_cfs_period_timer(cfs_b, overrun); + if (++count > 3) { + u64 new, old = ktime_to_ns(cfs_b->period); + + new = (old * 147) / 128; /* ~115% */ + new = min(new, max_cfs_quota_period); + + cfs_b->period = ns_to_ktime(new); + + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ + cfs_b->quota *= new; + cfs_b->quota = div64_u64(cfs_b->quota, old); + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + + /* reset count so we don't come right back in here */ + count = 0; + } + + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return idle ? 
HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -4986,6 +5042,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ + +static inline bool cfs_bandwidth_used(void) +{ + return false; +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { return rq_clock_task(rq_of(cfs_rq)); @@ -5083,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu); static inline bool cpu_overutilized(int cpu) { @@ -5177,6 +5238,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } + if (cfs_bandwidth_used()) { + /* + * When bandwidth control is enabled; the cfs_rq_throttled() + * breaks in the above iteration can result in incomplete + * leaf list maintenance, resulting in triggering the assertion + * below. + */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (list_add_leaf_cfs_rq(cfs_rq)) + break; + } + } + + assert_list_leaf_cfs_rq(rq); + hrtick_update(rq); } @@ -5556,11 +5634,6 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6053,7 +6126,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - cpumask_clear_cpu(cpu, cpus); + __cpumask_clear_cpu(cpu, cpus); if (!available_idle_cpu(cpu)) idle = false; } @@ -6073,7 +6146,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. 
*/ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; @@ -6097,7 +6170,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6202,7 +6275,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); + i = select_idle_smt(p, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -6608,7 +6681,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -7027,6 +7100,12 @@ idle: if (new_tasks > 0) goto again; + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); + return NULL; } @@ -7441,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -7577,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -7647,10 +7724,27 @@ static inline bool others_have_blocked(struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if 
(cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->avg.runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7662,14 +7756,10 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) update_tg_load_avg(cfs_rq, 0); /* Propagate pending load changes to the parent, if any: */ @@ -7677,14 +7767,21 @@ static void update_blocked_averages(int cpu) if (se && !skip_blocked_update(se)) update_load_avg(cfs_rq_of(se), se, 0); + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. 
+ */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); + /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) done = false; } curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7713,10 +7810,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) if (cfs_rq->last_h_load_update == now) return; - cfs_rq->h_load_next = NULL; + WRITE_ONCE(cfs_rq->h_load_next, NULL); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_load_next = se; + WRITE_ONCE(cfs_rq->h_load_next, se); if (cfs_rq->last_h_load_update == now) break; } @@ -7726,7 +7823,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) cfs_rq->last_h_load_update = now; } - while ((se = cfs_rq->h_load_next) != NULL) { + while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) { load = cfs_rq->h_load; load = div64_ul(load * se->avg.load_avg, cfs_rq_load_avg(cfs_rq) + 1); @@ -7754,11 +7851,11 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON 
rq->last_blocked_load_update_tick = jiffies; @@ -7989,6 +8086,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) } /* + * Check whether a rq has a misfit task and if it looks like we can actually + * help that task: we can migrate the task to a CPU of higher capacity, or + * the task's current CPU is heavily pressured. + */ +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +{ + return rq->misfit_task_load && + (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + check_cpu_capacity(rq, sd)); +} + +/* * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_allowed constraints. * @@ -8452,9 +8561,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; - env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_CAPACITY_SCALE); + env->imbalance = sds->busiest_stat.group_load; return 1; } @@ -8636,7 +8743,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) @@ -8827,21 +8934,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -static int need_active_balance(struct lb_env *env) +static inline bool +asym_active_balance(struct lb_env *env) { - struct sched_domain *sd = env->sd; + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. 
+ */ + return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu); +} - if (env->idle == CPU_NEWLY_IDLE) { +static inline bool +voluntary_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; - /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. - */ - if ((sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu)) - return 1; - } + if (asym_active_balance(env)) + return 1; /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. @@ -8859,6 +8970,16 @@ static int need_active_balance(struct lb_env *env) if (env->src_grp_type == group_misfit_task) return 1; + return 0; +} + +static int need_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + if (voluntary_active_balance(env)) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9023,7 +9144,7 @@ more_balance: if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* Prevent to re-select dst_cpu via env's CPUs */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); + __cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; @@ -9050,7 +9171,7 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); + __cpumask_clear_cpu(cpu_of(busiest), cpus); /* * Attempting to continue load balancing at the current * sched_domain level only makes sense if there are @@ -9120,7 +9241,7 @@ more_balance: } else sd->nr_balance_failed = 0; - if (likely(!active_balance)) { + if (likely(!active_balance) || voluntary_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { @@ -9427,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq) * - 
When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED is not set + * anywhere yet. */ static inline int find_new_ilb(void) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int ilb; - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + housekeeping_cpumask(HK_FLAG_MISC)) { + if (idle_cpu(ilb)) + return ilb; + } return nr_cpu_ids; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). + * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -9469,15 +9594,8 @@ static void kick_ilb(unsigned int flags) } /* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu in the system. - * - This rq has more than one task. - * - This rq has at least one CFS task and the capacity of the CPU is - * significantly reduced because of RT tasks or IRQs. - * - At parent of LLC scheduler domain level, this cpu's scheduler group has - * multiple busy cpu. - * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - * domain span are idle. + * Current decision point for kicking the idle load balancer in the presence + * of idle CPUs in the system.
*/ static void nohz_balancer_kick(struct rq *rq) { @@ -9510,30 +9628,21 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2 || rq->misfit_task_load) { + if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; } rcu_read_lock(); - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { - /* - * XXX: write a coherent comment on why we do this. - * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com - */ - nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - - } sd = rcu_dereference(rq->sd); if (sd) { - if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { + /* + * If there's a CFS task and the current CPU has reduced + * capacity; kick the ILB to see if there's a better CPU to run + * on. + */ + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_KICK_MASK; goto unlock; } @@ -9541,17 +9650,57 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (i == cpu || - !cpumask_test_cpu(i, nohz.idle_cpus_mask)) - continue; - + /* + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. + */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; } } } + + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + if (sd) { + /* + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU + * to run the misfit task on. + */ + if (check_misfit_status(rq, sd)) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + + /* + * For asymmetric systems, we do not want to nicely balance + * cache use, instead we want to embrace asymmetry and only + * ensure tasks have enough CPU capacity. 
+ * + * Skip the LLC logic because it's not relevant in that case. + */ + goto unlock; + } + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. + */ + nr_busy = atomic_read(&sds->nr_busy_cpus); + if (nr_busy > 1) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + } unlock: rcu_read_unlock(); out: @@ -10546,10 +10695,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81faddba9e20..687302051a27 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -65,6 +65,7 @@ void __init housekeeping_init(void) static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + cpumask_var_t tmp; int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); @@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) return 0; } + alloc_bootmem_cpumask_var(&tmp); if (!housekeeping_flags) { alloc_bootmem_cpumask_var(&housekeeping_mask); cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - if (cpumask_empty(housekeeping_mask)) - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - } else { - cpumask_var_t tmp; - alloc_bootmem_cpumask_var(&tmp); + 
cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) { + pr_warn("Housekeeping: must include one present CPU, " + "using boot CPU:%d\n", smp_processor_id()); + __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + } + } else { + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); if (!cpumask_equal(tmp, housekeeping_mask)) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); @@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - free_bootmem_cpumask_var(tmp); } + free_bootmem_cpumask_var(tmp); if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 90fb5bc12ad4..befce29bd882 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -26,7 +26,6 @@ #include <linux/sched.h> #include "sched.h" -#include "sched-pelt.h" #include "pelt.h" /* @@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) * n=1 */ static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, +accumulate_sum(u64 delta, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { - unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - delta += sa->period_contrib; periods = delta / 1024; /* A period is 1024us (~1ms) */ @@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, } sa->period_contrib = delta; - contrib = cap_scale(contrib, scale_freq); if (load) sa->load_sum += 
load * contrib; if (runnable) sa->runnable_load_sum += runnable * contrib; if (running) - sa->util_sum += contrib * scale_cpu; + sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; return periods; } @@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, +___update_load_sum(u64 now, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + if (!accumulate_sum(delta, sa, load, runnable, running)) return 0; return 1; @@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * runnable_load_avg = \Sum se->avg.runable_load_avg */ -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) return 0; } -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); @@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e return 0; } -int 
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { - if (___update_load_sum(now, cpu, &cfs_rq->avg, + if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), scale_load_down(cfs_rq->runnable_weight), cfs_rq->curr != NULL)) { @@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + if (___update_load_sum(now, &rq->avg_rt, running, running, running)) { @@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + if (___update_load_sum(now, &rq->avg_dl, running, running, running)) { @@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) int update_irq_load_avg(struct rq *rq, u64 running) { int ret = 0; + + /* + * We can't use clock_pelt because irq time is not accounted in + * clock_task. Instead we directly scale the running time to + * reflect the real amount of computation + */ + running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); + running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + /* * We know the time that has been used by interrupt since last update * but we don't when. Let be pessimistic and assume that interrupt has * happened just before the update. This is not so far from reality * because interrupt will most probably wake up task and trig an update - * of rq clock during which the metric si updated. + * of rq clock during which the metric is updated. * We start to decay with normal context time and then we add the * interrupt context time. 
* We can safely remove running from rq->clock because * rq->clock += delta with delta >= running */ - ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, 0, 0, 0); - ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + ret += ___update_load_sum(rq->clock, &rq->avg_irq, 1, 1, 1); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7e56b489ff32..7489d5f56960 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,8 +1,9 @@ #ifdef CONFIG_SMP +#include "sched-pelt.h" -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); @@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +/* + * The clock_pelt scales the time to reflect the effective amount of + * computation done during the running delta time but then sync back to + * clock_task when rq is idle. 
+ * + * + * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16 + * @ max capacity ------******---------------******--------------- + * @ half capacity ------************---------************--------- + * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16 + * + */ +static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) +{ + if (unlikely(is_idle_task(rq->curr))) { + /* The rq is idle, we can sync to clock_task */ + rq->clock_pelt = rq_clock_task(rq); + return; + } + + /* + * When a rq runs at a lower compute capacity, it will need + * more time to do the same amount of work than at max + * capacity. In order to be invariant, we scale the delta to + * reflect how much work has been really done. + * Running longer results in stealing idle time that will + * disturb the load signal compared to max capacity. This + * stolen idle time will be automatically reflected when the + * rq will be idle and the clock will be synced with + * rq_clock_task. + */ + + /* + * Scale the elapsed time to reflect the real amount of + * computation + */ + delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); + + rq->clock_pelt += delta; +} + +/* + * When rq becomes idle, we have to check if it has lost idle time + * because it was fully busy. A rq is fully used when the /Sum util_sum + * is greater or equal to: + * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT; + * For optimization and computing rounding purpose, we don't take into account + * the position in the current window (period_contrib) and we use the higher + * bound of util_sum to decide. 
+ */ +static inline void update_idle_rq_clock_pelt(struct rq *rq) +{ + u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX; + u32 util_sum = rq->cfs.avg.util_sum; + util_sum += rq->avg_rt.util_sum; + util_sum += rq->avg_dl.util_sum; + + /* + * Reflecting stolen time makes sense only if the idle + * phase would be present at max capacity. As soon as the + * utilization of a rq has reached the maximum value, it is + * considered as an always running rq without idle time to + * steal. This potential idle time is considered as lost in + * this case. We keep track of this lost idle time compared to + * rq's clock_task. + */ + if (util_sum >= divider) + rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; +} + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +#ifdef CONFIG_CFS_BANDWIDTH +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; +} +#else +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + return rq_clock_pelt(rq_of(cfs_rq)); +} +#endif + #else static inline int @@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running) { return 0; } + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + return rq_clock_task(rq); +} + +static inline void +update_rq_clock_pelt(struct rq *rq, s64 delta) { } + +static inline void +update_idle_rq_clock_pelt(struct rq *rq) { } + #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c3484785b179..0e97ca9306ef 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -322,7 +322,7 @@ static bool update_stats(struct psi_group *group) expires = group->next_update; if (now < 
expires) goto out; - if (now - expires > psi_period) + if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); /* diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e4f398ad9e73..1e6b909dca36 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * rt task */ if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * The previous task needs to be made eligible for pushing @@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); watchdog(rq, p); @@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; + else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; + if (rt_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d04530bf251f..b52ed1ada0be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -780,7 +780,7 @@ struct root_domain { * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. 
Protected by RCU. */ - struct perf_domain *pd; + struct perf_domain __rcu *pd; }; extern struct root_domain def_root_domain; @@ -861,13 +861,16 @@ struct rq { unsigned int clock_update_flags; u64 clock; - u64 clock_task; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain __rcu *sd; unsigned long cpu_capacity; unsigned long cpu_capacity_orig; @@ -951,6 +954,22 @@ struct rq { #endif }; +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -1260,7 +1279,7 @@ extern void sched_ttwu_pending(void); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. + * See destroy_sched_domains: call_rcu for details. * * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. 
@@ -1305,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) return sd; } -DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { @@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); #ifdef CONFIG_THREAD_INFO_IN_TASK - p->cpu = cpu; + WRITE_ONCE(p->cpu, cpu); #else - task_thread_info(p)->cpu = cpu; + WRITE_ONCE(task_thread_info(p)->cpu, cpu); #endif p->wake_cpu = cpu; #endif @@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p) static inline int task_on_rq_migrating(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_MIGRATING; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } /* @@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void post_init_entity_util_avg(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -2166,7 +2185,7 @@ static inline u64 irq_time_read(int cpu) #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_update_util - Take a note about CPU utilization changes. @@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} # define arch_scale_freq_invariant() false #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL /** * enum schedutil_type - CPU utilization type @@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -#else + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + #define perf_domain_span(pd) NULL -#endif +static inline bool sched_energy_enabled(void) { return false; } -#ifdef CONFIG_SMP -extern struct static_key_false sched_energy_present; -#endif +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3f35ba1d8fde..f53f89df837d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_STATIC_KEY_FALSE(sched_energy_present); +unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +#ifdef CONFIG_PROC_SYSCTL +int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, state; + + if (write && 
!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + state = static_branch_unlikely(&sched_energy_present); + if (state != sysctl_sched_energy_aware) { + mutex_lock(&sched_energy_mutex); + sched_energy_update = 1; + rebuild_sched_domains(); + sched_energy_update = 0; + mutex_unlock(&sched_energy_mutex); + } + } + + return ret; +} +#endif + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map) struct cpufreq_policy *policy; struct cpufreq_governor *gov; + if (!sysctl_sched_energy_aware) + goto free; + /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { if (sched_debug()) { @@ -442,7 +471,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); + call_rcu(&old_rd->rcu, free_rootdomain); } void sched_get_rd(struct root_domain *rd) @@ -455,7 +484,7 @@ void sched_put_rd(struct root_domain *rd) if (!atomic_dec_and_test(&rd->refcount)) return; - call_rcu_sched(&rd->rcu, free_rootdomain); + call_rcu(&rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -586,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd) * the cpumask of the domain), this allows us to quickly tell if * two CPUs are in the same cache domain, see cpus_share_cache(). 
*/ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) @@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { - struct sched_domain ** __percpu sd; + struct sched_domain * __percpu *sd; struct root_domain *rd; }; @@ -1030,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; struct sched_group *sg; + bool already_visited; if (child) cpu = cpumask_first(sched_domain_span(child)); @@ -1037,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg = *per_cpu_ptr(sdd->sg, cpu); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); - /* For claim_allocations: */ - atomic_inc(&sg->ref); - atomic_inc(&sg->sgc->ref); + /* Increase refcounts for claim_allocations: */ + already_visited = atomic_inc_return(&sg->ref) > 1; + /* sgc visits should follow a similar trend as sg */ + WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); + + /* If we have already visited that group, it's already initialized. 
*/ + if (already_visited) + return sg; if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); @@ -1058,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) /* * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. + * covered by the given span, will set each group's ->cpumask correctly, + * and will initialize their ->sgc. * * Assumes the sched_domain tree is fully constructed */ @@ -2046,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) } /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. + * Set up scheduler domains and groups. For now this just excludes isolated + * CPUs, but could be used to exclude other special cases in the future. */ int sched_init_domains(const struct cpumask *cpu_map) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e815781ed751..a635ecba6fe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -149,7 +149,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) sd->nr = syscall_get_nr(task, regs); sd->arch = syscall_get_arch(); - syscall_get_arguments(task, regs, 0, 6, args); + syscall_get_arguments(task, regs, args); sd->args[0] = args[0]; sd->args[1] = args[1]; sd->args[2] = args[2]; @@ -267,6 +267,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). 
*/ + preempt_disable(); for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); @@ -275,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, *match = f; } } + preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ @@ -329,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent, * Expects sighand and cred_guard_mutex locks to be held. * * Returns 0 on success, -ve on error, or the pid of a thread which was - * either not in the correct seccomp mode or it did not have an ancestral + * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) @@ -443,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. */ if (!task_no_new_privs(current) && - security_capable_noaudit(current_cred(), current_user_ns(), - CAP_SYS_ADMIN) != 0) + security_capable(current_cred(), current_user_ns(), + CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ @@ -500,7 +502,10 @@ out: * * Caller must be holding current->sighand->siglock lock. * - * Returns 0 on success, -ve on error. + * Returns 0 on success, -ve on error, or + * - in TSYNC mode: the pid of a thread which was either not in the correct + * seccomp mode or did not have an ancestral seccomp filter + * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, struct seccomp_filter *filter) @@ -1256,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags, if (flags & ~SECCOMP_FILTER_FLAG_MASK) return -EINVAL; + /* + * In the successful case, NEW_LISTENER returns the new listener fd. + * But in the failure case, TSYNC returns the thread that died. If you + * combine these two flags, there's no way to tell whether something + * succeeded or failed. So, let's disallow this combination. 
+ */ + if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && + (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) + return -EINVAL; + /* Prepare the new filter before holding any locks. */ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) @@ -1302,7 +1317,7 @@ out: mutex_unlock(&current->signal->cred_guard_mutex); out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { - if (ret < 0) { + if (ret) { listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); diff --git a/kernel/signal.c b/kernel/signal.c index 99fa8ff06fd9..cd83cc376767 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -19,7 +19,9 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> @@ -2436,9 +2438,12 @@ relock: } /* Has this task already been marked for death? */ - ksig->info.si_signo = signr = SIGKILL; - if (signal_group_exit(signal)) + if (signal_group_exit(signal)) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(&current->pending.signal, SIGKILL); + recalc_sigpending(); goto fatal; + } for (;;) { struct k_sigaction *ka; @@ -3452,7 +3457,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { @@ -3484,6 +3489,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, #endif #endif +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +{ + clear_siginfo(info); + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = 
from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -3493,16 +3508,126 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. + */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_siginfo_from_user(kinfo, info); +} + +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + +/** + * sys_pidfd_send_signal - send a signal to a process through a task file + * descriptor + * @pidfd: the file descriptor of the process + * @sig: signal to be sent + * @info: the signal info + * @flags: future flags to be passed + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(<positive-pid>, <signal>. It does not signal threads or process + * groups. 
+ * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. + * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + kernel_siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. */ + if (flags) + return -EINVAL; + + f = fdget(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + /* Only allow sending arbitrary signals to yourself. 
*/ + ret = -EPERM; + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + goto err; + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { diff --git a/kernel/softirq.c b/kernel/softirq.c index d28813306b2c..2c3382378d94 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -89,7 +89,8 @@ static bool ksoftirqd_running(unsigned long pending) if (pending & SOFTIRQ_NOW_MASK) return false; - return tsk && (tsk->state == TASK_RUNNING); + return tsk && (tsk->state == TASK_RUNNING) && + !__kthread_should_park(tsk); } /* @@ -572,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t) } EXPORT_SYMBOL(tasklet_kill); -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. 
- */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ - struct tasklet_hrtimer *ttimer = - container_of(timer, struct tasklet_hrtimer, timer); - - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ - struct tasklet_hrtimer *ttimer = (void *)data; - enum hrtimer_restart restart; - - restart = ttimer->function(&ttimer->timer); - if (restart != HRTIMER_NORESTART) - hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t which_clock, enum hrtimer_mode mode) -{ - hrtimer_init(&ttimer->timer, which_clock, mode); - ttimer->timer.function = __hrtimer_tasklet_trampoline; - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, - (unsigned long)ttimer); - ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); - void __init softirq_init(void) { int cpu; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f8edee9c792d..27bafc1e271e 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -5,41 +5,56 @@ * * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ +#include <linux/sched/task_stack.h> +#include <linux/sched/debug.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> -void print_stack_trace(struct stack_trace *trace, int spaces) +/** + * stack_trace_print - Print the entries in the 
stack trace + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + */ +void stack_trace_print(unsigned long *entries, unsigned int nr_entries, + int spaces) { - int i; + unsigned int i; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return; - for (i = 0; i < trace->nr_entries; i++) - printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); + for (i = 0; i < nr_entries; i++) + printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]); } -EXPORT_SYMBOL_GPL(print_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_print); -int snprint_stack_trace(char *buf, size_t size, - struct stack_trace *trace, int spaces) +/** + * stack_trace_snprint - Print the entries in the stack trace into a buffer + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed. 
+ */ +int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, + unsigned int nr_entries, int spaces) { - int i; - int generated; - int total = 0; + unsigned int generated, i, total = 0; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return 0; - for (i = 0; i < trace->nr_entries; i++) { + for (i = 0; i < nr_entries && size; i++) { generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', - (void *)trace->entries[i]); + (void *)entries[i]); total += generated; - - /* Assume that generated isn't a negative number */ if (generated >= size) { buf += size; size = 0; @@ -51,7 +66,176 @@ int snprint_stack_trace(char *buf, size_t size, return total; } -EXPORT_SYMBOL_GPL(snprint_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_snprint); + +#ifdef CONFIG_ARCH_STACKWALK + +struct stacktrace_cookie { + unsigned long *store; + unsigned int size; + unsigned int skip; + unsigned int len; +}; + +static bool stack_trace_consume_entry(void *cookie, unsigned long addr, + bool reliable) +{ + struct stacktrace_cookie *c = cookie; + + if (c->len >= c->size) + return false; + + if (c->skip > 0) { + c->skip--; + return true; + } + c->store[c->len++] = addr; + return c->len < c->size; +} + +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, + bool reliable) +{ + if (in_sched_functions(addr)) + return true; + return stack_trace_consume_entry(cookie, addr, reliable); +} + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + arch_stack_walk(consume_entry, &c, current, NULL); + return c.len; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + if (!try_get_task_stack(tsk)) + return 0; + + arch_stack_walk(consume_entry, &c, tsk, NULL); + put_task_stack(tsk); + return c.len; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr, + }; + + arch_stack_walk(consume_entry, &c, current, regs); + return c.len; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + int ret; + + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ + if (!try_get_task_stack(tsk)) + return 0; + + ret = arch_stack_walk_reliable(consume_entry, &c, tsk); + put_task_stack(tsk); + return ret; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + + /* Trace user stack if not a kernel thread */ + if (!current->mm) + return 0; + + arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); + return c.len; +} +#endif + +#else /* CONFIG_ARCH_STACKWALK */ /* * Architectures that do not implement save_stack_trace_*() @@ -77,3 +261,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); return -ENOSYS; } + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace(&trace); + return trace.nr_entries; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to 
storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr, + }; + + save_stack_trace_regs(regs, &trace); + return trace.nr_entries; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + int ret = save_stack_trace_tsk_reliable(tsk, &trace); + + return ret ? 
ret : trace.nr_entries; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + + save_stack_trace_user(&trace); + return trace.nr_entries; +} +#endif /* CONFIG_USER_STACKTRACE_SUPPORT */ + +#endif /* !CONFIG_ARCH_STACKWALK */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/sys.c b/kernel/sys.c index f7eb62eceb24..12df0e5434b8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid) old = current_cred(); retval = -EPERM; - if (ns_capable(old->user_ns, CAP_SETUID)) { + if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); @@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) old = current_cred(); retval = -EPERM; - if (!ns_capable(old->user_ns, 
CAP_SETUID)) { + if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; @@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid) if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || - ns_capable(old->user_ns, CAP_SETUID)) { + ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; + /* fall through */ case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ab9d0e3c6d50..4d9ae5ea6caf 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -42,10 +42,15 @@ COND_SYSCALL(io_destroy); COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents_time32); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL(io_uring_setup); +COND_SYSCALL(io_uring_enter); +COND_SYSCALL(io_uring_register); /* fs/xattr.c */ @@ -114,9 +119,9 @@ COND_SYSCALL_COMPAT(signalfd4); /* fs/timerfd.c */ COND_SYSCALL(timerfd_create); COND_SYSCALL(timerfd_settime); -COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_settime32); COND_SYSCALL(timerfd_gettime); -COND_SYSCALL_COMPAT(timerfd_gettime); +COND_SYSCALL(timerfd_gettime32); /* fs/utimes.c */ @@ -135,7 +140,7 @@ COND_SYSCALL(capset); /* kernel/futex.c */ COND_SYSCALL(futex); -COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(futex_time32); COND_SYSCALL(set_robust_list); COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); @@ -162,8 +167,6 @@ COND_SYSCALL(syslog); 
/* kernel/sched/core.c */ -/* kernel/signal.c */ - /* kernel/sys.c */ COND_SYSCALL(setregid); COND_SYSCALL(setgid); @@ -187,9 +190,9 @@ COND_SYSCALL(mq_open); COND_SYSCALL_COMPAT(mq_open); COND_SYSCALL(mq_unlink); COND_SYSCALL(mq_timedsend); -COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedsend_time32); COND_SYSCALL(mq_timedreceive); -COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_timedreceive_time32); COND_SYSCALL(mq_notify); COND_SYSCALL_COMPAT(mq_notify); COND_SYSCALL(mq_getsetattr); @@ -197,8 +200,10 @@ COND_SYSCALL_COMPAT(mq_getsetattr); /* ipc/msg.c */ COND_SYSCALL(msgget); +COND_SYSCALL(old_msgctl); COND_SYSCALL(msgctl); COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL_COMPAT(old_msgctl); COND_SYSCALL(msgrcv); COND_SYSCALL_COMPAT(msgrcv); COND_SYSCALL(msgsnd); @@ -206,16 +211,20 @@ COND_SYSCALL_COMPAT(msgsnd); /* ipc/sem.c */ COND_SYSCALL(semget); +COND_SYSCALL(old_semctl); COND_SYSCALL(semctl); COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL_COMPAT(old_semctl); COND_SYSCALL(semtimedop); -COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semtimedop_time32); COND_SYSCALL(semop); /* ipc/shm.c */ COND_SYSCALL(shmget); +COND_SYSCALL(old_shmctl); COND_SYSCALL(shmctl); COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL_COMPAT(old_shmctl); COND_SYSCALL(shmat); COND_SYSCALL_COMPAT(shmat); COND_SYSCALL(shmdt); @@ -285,7 +294,7 @@ COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); COND_SYSCALL(recvmmsg); COND_SYSCALL(recvmmsg_time32); -COND_SYSCALL_COMPAT(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time64); /* @@ -366,6 +375,7 @@ COND_SYSCALL(kexec_file_load); /* s390 */ COND_SYSCALL(s390_pci_mmio_read); COND_SYSCALL(s390_pci_mmio_write); +COND_SYSCALL(s390_ipc); COND_SYSCALL_COMPAT(s390_ipc); /* powerpc */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba4d9e85feb8..c9ec050bcf46 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,8 @@ #include <linux/bpf.h> #include <linux/mount.h> +#include "../lib/kstrtox.h" + 
#include <linux/uaccess.h> #include <asm/processor.h> @@ -126,7 +128,9 @@ static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; +static unsigned long zero_ul; static unsigned long one_ul = 1; +static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int one_thousand = 1000; #ifdef CONFIG_PRINTK @@ -224,6 +228,11 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_BPF_SYSCALL +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -467,6 +476,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -1229,6 +1249,15 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, + { + .procname = "bpf_stats_enabled", + .data = &sysctl_bpf_stats_enabled, + .maxlen = sizeof(sysctl_bpf_stats_enabled), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_bpf_stats, + .extra1 = &zero, + .extra2 = &one, + }, #endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { @@ -1446,7 +1475,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, @@ -1722,6 +1751,8 @@ 
static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, }, { .procname = "nr_open", @@ -2092,6 +2123,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v) } } +/** + * strtoul_lenient - parse an ASCII formatted integer from a buffer and only + * fail on overflow + * + * @cp: kernel buffer containing the string to parse + * @endp: pointer to store the trailing characters + * @base: the base to use + * @res: where the parsed integer will be stored + * + * In case of success 0 is returned and @res will contain the parsed integer, + * @endp will hold any trailing characters. + * This function will fail the parse on overflow. If there wasn't an overflow + * the function will defer the decision what characters count as invalid to the + * caller. + */ +static int strtoul_lenient(const char *cp, char **endp, unsigned int base, + unsigned long *res) +{ + unsigned long long result; + unsigned int rv; + + cp = _parse_integer_fixup_radix(cp, &base); + rv = _parse_integer(cp, base, &result); + if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) + return -ERANGE; + + cp += rv; + + if (endp) + *endp = (char *)cp; + + *res = (unsigned long)result; + return 0; +} + #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formatted integer from a user buffer @@ -2135,7 +2201,8 @@ static int proc_get_long(char **buf, size_t *size, if (!isdigit(*p)) return -EINVAL; - *val = simple_strtoul(p, &p, 0); + if (strtoul_lenient(p, &p, 0, val)) + return -EINVAL; len = p - tmp; @@ -2577,23 +2644,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { + int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? 
&tmp : valp; + + ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + if (write) { - int val = *negp ? -*lvalp : *lvalp; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -EINVAL; - *valp = val; - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } + *valp = tmp; } + return 0; } @@ -2642,22 +2711,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { + int ret; + unsigned int tmp; struct do_proc_douintvec_minmax_conv_param *param = data; + /* write via temporary local uint for bounds-checking */ + unsigned int *up = write ? &tmp : valp; - if (write) { - unsigned int val = *lvalp; + ret = do_proc_douintvec_conv(lvalp, up, write, data); + if (ret) + return ret; - if (*lvalp > UINT_MAX) - return -EINVAL; - - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -ERANGE; - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; + *valp = tmp; } return 0; @@ -3260,6 +3329,29 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, #endif /* CONFIG_PROC_SYSCTL */ +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) +static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, bpf_stats = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &bpf_stats; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + *(int *)table->data = bpf_stats; + if (bpf_stats) + 
static_branch_enable(&bpf_stats_enabled_key); + else + static_branch_disable(&bpf_stats_enabled_key); + } + return ret; +} +#endif /* * No sense putting this after each symbol definition, twice, * exception granted :-) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 58b981f4bb5d..e2c038d6c13c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -117,6 +117,35 @@ config NO_HZ_FULL endchoice +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fulfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. 
+ config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2c97e8c2d29f..0519a8805aab 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -594,7 +594,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return ktime_sub(now, alarm->node.expires); + return ktime_sub(alarm->node.expires, now); } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -611,6 +611,22 @@ void clockevents_resume(void) } #ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu: The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. + */ +void tick_offline_cpu(unsigned int cpu) +{ + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); + raw_spin_unlock(&clockevents_lock); +} +# endif + /** * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ @@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu) raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_shutdown_broadcast_oneshot(cpu); - tick_shutdown_broadcast(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f5cfa1b73d6f..41dfff23c1f9 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -364,7 +364,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - + /* fall through */ default: return false; } @@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct 
old_timespec32 __user *, rqtp, +SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..d23b434c2ca7 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { - unsigned long seq; + unsigned int seq; u64 ret; do { @@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void) return &clocksource_jiffies; } -struct clocksource refined_jiffies; +static struct clocksource refined_jiffies; int register_refined_jiffies(long cycles_per_second) { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36a2bef00125..92a90014a925 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -188,13 +188,13 @@ static inline int is_error_status(int status) && (status & (STA_PPSWANDER|STA_PPSERROR))); } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc) { txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = pps_jitter; if (!(time_status & STA_NANO)) - txc->jitter /= NSEC_PER_USEC; + txc->jitter = pps_jitter / NSEC_PER_USEC; txc->shift = pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; @@ -220,7 +220,7 @@ static inline int is_error_status(int status) return status & (STA_UNSYNC|STA_CLOCKERR); } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; @@ -633,7 +633,7 @@ void ntp_notify_cmos_timer(void) /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(const struct timex *txc) +static inline void process_adj_status(const struct __kernel_timex *txc) { if ((time_status 
& STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -656,7 +656,8 @@ static inline void process_adj_status(const struct timex *txc) } -static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai) +static inline void process_adjtimex_modes(const struct __kernel_timex *txc, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc); @@ -707,7 +708,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) +int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai) { int result; @@ -729,7 +731,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; + txc->offset = (u32)txc->offset / NSEC_PER_USEC; } result = time_state; /* mostly `TIME_OK' */ @@ -754,7 +756,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) - txc->time.tv_usec /= NSEC_PER_USEC; + txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; /* Handle leapsec adjustments */ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index c24b0e13f011..40e6122e634e 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,6 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, const struct 
timespec64 *ts, s32 *time_tai); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 425bbfce6819..ec960bb939fd 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -228,7 +228,7 @@ static void put_clock_desc(struct posix_clock_desc *cd) fput(cd->fp); } -static int pc_clock_adjtime(clockid_t id, struct timex *tx) +static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) { struct posix_clock_desc cd; int err; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 80f955210861..0a426f4e3125 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) int i; u64 delta, incr; - if (timer->it.cpu.incr == 0) + if (!timer->it_interval) return; if (now < timer->it.cpu.expires) return; - incr = timer->it.cpu.incr; + incr = timer->it_interval; delta = now + incr - timer->it.cpu.expires; /* Don't use (incr*2 < delta), incr*2 might overflow. */ @@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ wake_up_process(timer->it_process); timer->it.cpu.expires = 0; - } else if (timer->it.cpu.incr == 0) { + } else if (!timer->it_interval) { /* * One-shot timer. Clear it as soon as it's fired. */ @@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ ret = 0; - old_incr = timer->it.cpu.incr; + old_incr = timer->it_interval; old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; @@ -684,8 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. 
*/ - timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); - timer->it_interval = ns_to_ktime(timer->it.cpu.incr); + timer->it_interval = timespec64_to_ktime(new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -724,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp /* * Easy part: convert the reload time. */ - itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); + itp->it_interval = ktime_to_timespec64(timer->it_interval); if (!timer->it.cpu.expires) return; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index a51895486e5e..67df65f887ac 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -45,6 +45,7 @@ SYS_NI(timer_delete); SYS_NI(clock_adjtime); SYS_NI(getitimer); SYS_NI(setitimer); +SYS_NI(clock_adjtime32); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif @@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime); COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYS_NI(timer_settime32); +SYS_NI(timer_gettime32); + +SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; @@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, return do_sys_settimeofday64(&new_tp, NULL); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return 0; } 
-COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } } -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0e84bb72a3da..29176635991f 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -179,7 +179,7 @@ static int posix_clock_realtime_set(const clockid_t which_clock, } static int posix_clock_realtime_adj(const clockid_t which_clock, - struct timex *t) + struct __kernel_timex *t) { return do_adjtimex(t); } @@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct old_itimerspec32 __user *, setting) +SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; @@ -903,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct old_itimerspec32 __user *, new, - struct old_itimerspec32 __user *, old) +SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? 
&old_spec : NULL; @@ -1047,22 +1047,28 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return error; } -SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, - struct timex __user *, utx) +int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; - int err; if (!kc) return -EINVAL; if (!kc->clock_adj) return -EOPNOTSUPP; + return kc->clock_adj(which_clock, ktx); +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct __kernel_timex __user *, utx) +{ + struct __kernel_timex ktx; + int err; + if (copy_from_user(&ktx, utx, sizeof(ktx))) return -EFAULT; - err = kc->clock_adj(which_clock, &ktx); + err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; @@ -1090,8 +1096,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1105,8 +1111,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, return kc->clock_set(which_clock, &ts); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1123,40 +1129,26 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } -#endif - -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, - struct compat_timex __user *, utp) +SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, + struct old_timex32 __user *, utp) { - const 
struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; + struct __kernel_timex ktx; int err; - if (!kc) - return -EINVAL; - if (!kc->clock_adj) - return -EOPNOTSUPP; - - err = compat_get_timex(&ktx, utp); + err = get_old_timex32(&ktx, utp); if (err) return err; - err = kc->clock_adj(which_clock, &ktx); + err = do_clock_adjtime(which_clock, &ktx); if (err >= 0) - err = compat_put_timex(utp, &ktx); + err = put_old_timex32(utp, &ktx); return err; } -#endif - -#ifdef CONFIG_COMPAT_32BIT_TIME - -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1212,9 +1204,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index ddb21145211a..de5daa6d975a 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -8,7 +8,7 @@ struct k_clock { const struct timespec64 *tp); int (*clock_get)(const clockid_t which_clock, struct timespec64 *tp); - int (*clock_adj)(const clockid_t which_clock, struct timex *tx); + int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, const struct timespec64 *); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..142b07619918 
100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { u64 cyc, res; - unsigned long seq; + unsigned int seq; struct clock_read_data *rd; do { @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) @@ -267,12 +267,12 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned long seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } -static int sched_clock_suspend(void) +int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -283,7 +283,7 @@ static int sched_clock_suspend(void) return 0; } -static void sched_clock_resume(void) +void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 803fa67aace9..e51778c312f1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +# ifdef CONFIG_HOTPLUG_CPU +static void tick_broadcast_oneshot_offline(unsigned int cpu); +# endif #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct 
clock_event_device *bc) { } +# ifdef CONFIG_HOTPLUG_CPU +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } +# endif #endif /* @@ -375,6 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; + /* fall through */ case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { @@ -432,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) } #ifdef CONFIG_HOTPLUG_CPU -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int cpu) +static void tick_shutdown_broadcast(void) { - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_broadcast_mask); - cpumask_clear_cpu(cpu, tick_broadcast_on); + struct clock_event_device *bc = tick_broadcast_device.evtdev; if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } +} - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ + raw_spin_lock(&tick_broadcast_lock); + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + tick_broadcast_oneshot_offline(cpu); + tick_shutdown_broadcast(); + raw_spin_unlock(&tick_broadcast_lock); } + #endif void tick_suspend_broadcast(void) @@ -800,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * either the CPU handling the broadcast * interrupt or we got woken by something else. * - * We are not longer in the broadcast mask, so + * We are no longer in the broadcast mask, so * if the cpu local expiry time is already * reached, we would reprogram the cpu local * timer with an already expired event. 
* * This can lead to a ping-pong when we return - * to idle and therefor rearm the broadcast + * to idle and therefore rearm the broadcast * timer before the cpu local timer was able * to fire. This happens because the forced * reprogramming makes sure that the event @@ -949,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* - * Remove a dead CPU from broadcasting + * Remove a dying CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int cpu) +static void tick_broadcast_oneshot_offline(unsigned int cpu) { - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! @@ -964,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 529143b4c8d2..59225b484e4e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -46,6 +46,14 @@ ktime_t tick_period; * procedure also covers cpu hotplug. */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +#ifdef CONFIG_NO_HZ_FULL +/* + * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns + * tick_do_timer_cpu and it should be taken over by an eligible secondary + * when one comes online. 
+ */ +static int tick_do_timer_boot_cpu __read_mostly = -1; +#endif /* * Debugging: see timer_list.c @@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) !tick_broadcast_oneshot_active()) { clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { - unsigned long seq; + unsigned int seq; ktime_t next; do { @@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) } } +#ifdef CONFIG_NO_HZ_FULL +static void giveup_do_timer(void *info) +{ + int cpu = *(unsigned int *)info; + + WARN_ON(tick_do_timer_cpu != smp_processor_id()); + + tick_do_timer_cpu = cpu; +} + +static void tick_take_do_timer_from_boot(void) +{ + int cpu = smp_processor_id(); + int from = tick_do_timer_boot_cpu; + + if (from >= 0 && from != cpu) + smp_call_function_single(from, giveup_do_timer, &cpu, 1); +} +#endif + /* * Setup the tick device */ @@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - else - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); tick_period = NSEC_PER_SEC / HZ; +#ifdef CONFIG_NO_HZ_FULL + /* + * The boot CPU may be nohz_full, in which case set + * tick_do_timer_boot_cpu so the first housekeeping + * secondary that comes up will take do_timer from + * us. 
+ */ + if (tick_nohz_full_cpu(cpu)) + tick_do_timer_boot_cpu = cpu; + + } else if (tick_do_timer_boot_cpu != -1 && + !tick_nohz_full_cpu(cpu)) { + tick_take_do_timer_from_boot(); + tick_do_timer_boot_cpu = -1; + WARN_ON(tick_do_timer_cpu != cpu); +#endif } /* @@ -487,6 +529,7 @@ void tick_freeze(void) trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); system_state = SYSTEM_SUSPEND; + sched_clock_suspend(); timekeeping_suspend(); } else { tick_suspend_local(); @@ -510,6 +553,7 @@ void tick_unfreeze(void) if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); + sched_clock_resume(); system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int cpu); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); extern bool tick_resume_check_broadcast(void); @@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int cpu) { } static inline void tick_suspend_broadcast(void) { } static inline void tick_resume_broadcast(void) { } static inline bool 
tick_resume_check_broadcast(void) { return false; } @@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..f4ee1a3428ae 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. + * + * If nohz_full is enabled, this should not happen because the + * tick_do_timer_cpu never relinquishes. 
*/ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) - && !tick_nohz_full_cpu(cpu)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { +#ifdef CONFIG_NO_HZ_FULL + WARN_ON(tick_nohz_full_running); +#endif tick_do_timer_cpu = cpu; + } #endif /* Check, if the jiffies need an update */ @@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) static int tick_nohz_cpu_down(unsigned int cpu) { /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks + * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) @@ -423,12 +429,15 @@ void __init tick_nohz_init(void) return; } - cpu = smp_processor_id(); + if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && + !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { + cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", - cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warn("NO_HZ: Clearing %d from nohz_full range " + "for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } } for_each_cpu(cpu, tick_nohz_full_mask) @@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void) static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; - unsigned long seq, basejiff; + unsigned long basejiff; + unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { @@ -904,8 +914,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) /* * Boot safety: make sure the timekeeping duty has been * assigned before entering dyntick-idle mode, + * tick_do_timer_cpu is TICK_DO_TIMER_BOOT */ - 
if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) + return false; + + /* Should not happen for nohz-full */ + if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } @@ -1023,6 +1038,18 @@ bool tick_nohz_idle_got_tick(void) } /** + * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer + * or the tick, whichever expires first. Note that, if the tick has been + * stopped, it returns the next hrtimer. + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_next_hrtimer(void) +{ + return __this_cpu_read(tick_cpu_device.evtdev)->next_event; +} + +/** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -24,12 +24,19 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode + * @check_clocks: Notification mechanism about clocksource changes + * @nohz_mode: Mode - one state of tick_nohz_mode + * @inidle: Indicator that the CPU is in the tick idle mode + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_active: Indicator that the CPU is actively in the tick idle mode; + * it is reset during irq handling phases. + * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. 
- * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped @@ -40,8 +47,8 @@ enum tick_nohz_mode { * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires - * @do_timer_lst: CPU was the last one doing do_timer before going idle - * @got_idle_tick: Tick timer function has run with @inidle set + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick */ struct tick_sched { struct hrtimer sched_timer; diff --git a/kernel/time/time.c b/kernel/time/time.c index 2edb5088a70b..86656bbac232 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME */ -#ifdef CONFIG_COMPAT -#ifdef __ARCH_WANT_COMPAT_SYS_TIME +#ifdef CONFIG_COMPAT_32BIT_TIME +#ifdef __ARCH_WANT_SYS_TIME32 /* old_time32_t is a 32 bit "long" and needs to get converted. 
*/ -COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) +SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; @@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) +SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -134,7 +134,7 @@ COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) return 0; } -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif /* __ARCH_WANT_SYS_TIME32 */ #endif SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, @@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz static int firsttime = 1; int error = 0; - if (tv && !timespec64_valid(tv)) + if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); @@ -263,35 +263,99 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { - struct timex txc; /* Local copy of parameter */ + struct __kernel_timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ - if (copy_from_user(&txc, txc_p, sizeof(struct timex))) + if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) return -EFAULT; ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; + return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? 
-EFAULT : ret; } +#endif -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME +int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) +{ + struct old_timex32 tx32; + + memset(txc, 0, sizeof(struct __kernel_timex)); + if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) + return -EFAULT; + + txc->modes = tx32.modes; + txc->offset = tx32.offset; + txc->freq = tx32.freq; + txc->maxerror = tx32.maxerror; + txc->esterror = tx32.esterror; + txc->status = tx32.status; + txc->constant = tx32.constant; + txc->precision = tx32.precision; + txc->tolerance = tx32.tolerance; + txc->time.tv_sec = tx32.time.tv_sec; + txc->time.tv_usec = tx32.time.tv_usec; + txc->tick = tx32.tick; + txc->ppsfreq = tx32.ppsfreq; + txc->jitter = tx32.jitter; + txc->shift = tx32.shift; + txc->stabil = tx32.stabil; + txc->jitcnt = tx32.jitcnt; + txc->calcnt = tx32.calcnt; + txc->errcnt = tx32.errcnt; + txc->stbcnt = tx32.stbcnt; + + return 0; +} + +int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) +{ + struct old_timex32 tx32; + + memset(&tx32, 0, sizeof(struct old_timex32)); + tx32.modes = txc->modes; + tx32.offset = txc->offset; + tx32.freq = txc->freq; + tx32.maxerror = txc->maxerror; + tx32.esterror = txc->esterror; + tx32.status = txc->status; + tx32.constant = txc->constant; + tx32.precision = txc->precision; + tx32.tolerance = txc->tolerance; + tx32.time.tv_sec = txc->time.tv_sec; + tx32.time.tv_usec = txc->time.tv_usec; + tx32.tick = txc->tick; + tx32.ppsfreq = txc->ppsfreq; + tx32.jitter = txc->jitter; + tx32.shift = txc->shift; + tx32.stabil = txc->stabil; + tx32.jitcnt = txc->jitcnt; + tx32.calcnt = txc->calcnt; + tx32.errcnt = txc->errcnt; + tx32.stbcnt = txc->stbcnt; + tx32.tai = txc->tai; + if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) + return -EFAULT; + return 0; +} -COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) +SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, 
utp) { - struct timex txc; + struct __kernel_timex txc; int err, ret; - err = compat_get_timex(&txc, utp); + err = get_old_timex32(&txc, utp); if (err) return err; ret = do_adjtimex(&txc); - err = compat_put_timex(utp, &txc); + err = put_old_timex32(utp, &txc); if (err) return err; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ac5dbf2cd4a2..5716e28bfa3c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; - unsigned long seq; + unsigned int seq; ktime_t tconv; do { @@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void) void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; ktime_t base_raw; ktime_t base_real; u64 nsec_raw; @@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn) ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; - unsigned long seq; + unsigned int seq; bool do_interp; int ret; @@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts) unsigned long flags; int ret = 0; - if (!timespec64_valid_strict(ts)) + if (!timespec64_valid_settod(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), *ts); if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_strict(&tmp)) { + !timespec64_valid_settod(&tmp)) { ret = -EINVAL; goto 
error; } @@ -1409,7 +1409,7 @@ int timekeeping_notify(struct clocksource *clock) void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; do { @@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64); int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; int ret; do { @@ -1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void) u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 ret; do { @@ -1527,7 +1527,7 @@ void __init timekeeping_init(void) unsigned long flags; read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); - if (timespec64_valid_strict(&wall_time) && + if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { @@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); @@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); @@ -2234,7 +2234,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /** * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2300,7 +2300,7 @@ static int timekeeping_validate_timex(const struct timex *txc) /** * do_adjtimex() - Accessor function to 
NTP __do_adjtimex function */ -int do_adjtimex(struct timex *txc) +int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..141ab3ab0354 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void); extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); +#ifdef CONFIG_GENERIC_SCHED_CLOCK +extern int sched_clock_suspend(void); +extern void sched_clock_resume(void); +#else +static inline int sched_clock_suspend(void) { return 0; } +static inline void sched_clock_resume(void) { } +#endif extern void do_timer(unsigned long ticks); extern void update_wall_time(void); diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 86489950d690..b73e8850e58d 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -37,15 +37,8 @@ DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time); static int __init tk_debug_sleep_time_init(void) { - struct dentry *d; - - d = debugfs_create_file("sleep_time", 0444, NULL, NULL, - &tk_debug_sleep_time_fops); - if (!d) { - pr_err("Failed to create sleep_time debug file\n"); - return -ENOMEM; - } - + debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); return 0; } late_initcall(tk_debug_sleep_time_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 444156debfa0..343c7ba33b1c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); } static void @@ -647,7 +649,7 @@ static bool timer_fixup_activate(void *addr, enum 
debug_obj_state state) case ODEBUG_STATE_ACTIVE: WARN_ON(1); - + /* fall through */ default: return false; } @@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer) trace_timer_init(timer); } -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ - debug_timer_activate(timer); - trace_timer_start(timer, expires, timer->flags); -} - static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); @@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } - debug_activate(timer, expires); + debug_timer_activate(timer); timer->expires = expires; /* @@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu) } forward_timer_base(base); - debug_activate(timer, timer->expires); + debug_timer_activate(timer); internal_add_timer(base, timer); raw_spin_unlock_irqrestore(&base->lock, flags); } @@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +static void call_timer_fn(struct timer_list *timer, + void (*fn)(struct timer_list *), + unsigned long baseclk) { int count = preempt_count(); @@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list */ lock_map_acquire(&lockdep_map); - trace_timer_expire_entry(timer); + trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. 
That gives us a decent @@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list static void expire_timers(struct timer_base *base, struct hlist_head *head) { + /* + * This value is required only for tracing. base->clk was + * incremented directly before expire_timers was called. But expiry + * is related to the old base->clk value. + */ + unsigned long baseclk = base->clk - 1; + while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); @@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); } } @@ -1632,7 +1636,7 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(user_tick); + rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); diff --git a/kernel/torture.c b/kernel/torture.c index bbf6d473e50c..17b2be9bde12 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Common functions for in-kernel torture tests. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * Based on kernel/rcu/torture.c. */ @@ -53,7 +40,7 @@ #include "rcu/rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); static char *torture_type; static int verbose; @@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex); static struct task_struct *onoff_task; static long onoff_holdoff; static long onoff_interval; +static torture_ofl_func *onoff_f; static long n_offline_attempts; static long n_offline_successes; static unsigned long sum_offline; @@ -100,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; + if (num_online_cpus() <= 1) + return false; /* Can't offline the last CPU. */ if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -118,6 +108,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); + if (onoff_f) + onoff_f(); (*n_offl_successes)++; delta = jiffies - starttime; *sum_offl += delta; @@ -243,11 +235,12 @@ stop: /* * Initiate online-offline handling. 
*/ -int torture_onoff_init(long ooholdoff, long oointerval) +int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f) { #ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; + onoff_f = f; if (onoff_interval <= 0) return 0; return torture_create_kthread(torture_onoff, NULL, onoff_task); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fa8b1fe824f3..8bd1d6d001d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -370,6 +370,7 @@ config PROFILE_ANNOTATED_BRANCHES config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING + imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..e1c6d79fb4cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -723,6 +723,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; + /* fall through */ case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1a86a0d881d..94b0e37d90ef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,8 @@ #include <linux/syscalls.h> #include <linux/error-injection.h> +#include <asm/tlb.h> + #include "trace_probe.h" #include "trace.h" @@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, * access_ok() should prevent writing to non-user memory, but in * some situations (nommu, temporary switch, etc) access_ok() does * not provide enough validation, hence the check on KERNEL_DS. + * + * nmi_uaccess_okay() ensures the probe is not run in an interim + * state, when the task or mm are switched. 
This is specifically + * required to prevent the use of temporary mm. */ if (unlikely(in_interrupt() || @@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(uaccess_kernel())) return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; if (!access_ok(unsafe_ptr, size)) return -EPERM; @@ -431,8 +439,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_event_output(event, sd, regs); - return 0; + return perf_event_output(event, sd, regs); } BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aac7847c0214..b920358dd8f7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,6 +33,7 @@ #include <linux/list.h> #include <linux/hash.h> #include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <trace/events/sched.h> @@ -1992,7 +1993,7 @@ static void print_bug_type(void) * modifying the code. 
@failed should be one of either: * EFAULT - if the problem happens on reading the @ip address * EINVAL - if what is read at @ip is not what was expected - * EPERM - if the problem happens on writting to the @ip address + * EPERM - if the problem happens on writing to the @ip address */ void ftrace_bug(int failed, struct dyn_ftrace *rec) { @@ -2391,7 +2392,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } - return -1; /* unknow ftrace bug */ + return -1; /* unknown ftrace bug */ } void __weak ftrace_replace_code(int mod_flags) @@ -3004,7 +3005,7 @@ ftrace_allocate_pages(unsigned long num_to_init) int cnt; if (!num_to_init) - return 0; + return NULL; start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); if (!pg) @@ -3702,6 +3703,31 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) } static int +add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, + int clear_filter) +{ + long index = simple_strtoul(func_g->search, NULL, 0); + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + /* The index starts at 1 */ + if (--index < 0) + return 0; + + do_for_each_ftrace_rec(pg, rec) { + if (pg->index <= index) { + index -= pg->index; + /* this is a double loop, break goes to the next page */ + break; + } + rec = &pg->records[index]; + enter_record(hash, rec, clear_filter); + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + +static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) { @@ -3769,6 +3795,11 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) if (unlikely(ftrace_disabled)) goto out_unlock; + if (func_g.type == MATCH_INDEX) { + found = add_rec_by_index(hash, &func_g, clear_filter); + goto out_unlock; + } + do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_DISABLED) @@ -4725,7 +4756,7 @@ static int ftrace_set_addr(struct ftrace_ops 
*ops, unsigned long ip, int remove, int reset, int enable) { - return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } /** @@ -5433,7 +5464,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, /* * The name "destroy_filter_files" is really a misnomer. Although - * in the future, it may actualy delete the files, but this is + * in the future, it may actually delete the files, but this is * really intended to make sure the ops passed in are disabled * and that when this function returns, the caller is free to * free the ops. @@ -5756,7 +5787,7 @@ void ftrace_module_enable(struct module *mod) /* * If the tracing is enabled, go ahead and enable the record. * - * The reason not to enable the record immediatelly is the + * The reason not to enable the record immediately is the * inherent check of ftrace_make_nop/ftrace_make_call for * correct previous instructions. Making first the NOP * conversion puts the module to the correct state, thus @@ -6216,7 +6247,7 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static inline void +static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { @@ -6276,11 +6307,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } +NOKPROBE_SYMBOL(ftrace_ops_list_func); #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } +NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif /* @@ -6307,6 +6340,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); trace_clear_recursion(bit); } +NOKPROBE_SYMBOL(ftrace_ops_assist_func); /** * ftrace_ops_get_func - get the function a trampoline should call diff --git a/kernel/trace/ring_buffer.c 
b/kernel/trace/ring_buffer.c index 06e864a334bb..4ee8d8aa3d0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -353,20 +353,6 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } -/** - * ring_buffer_page_len - the size of data on the page. - * @page: The page to read - * - * Returns the amount of data on the page, including buffer page header. - */ -size_t ring_buffer_page_len(void *page) -{ - struct buffer_data_page *bpage = page; - - return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) - + BUF_PAGE_HDR_SIZE; -} - /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -776,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) preempt_disable_notrace(); time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return time; } @@ -4205,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over + * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording @@ -4222,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. 
*/ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4230,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), flags); if (!iter) return NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..ec439999f387 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * not modified. 
*/ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) + if (!pid_list) { + trace_parser_put(&parser); return -ENOMEM; + } pid_list->pid_max = READ_ONCE(pid_max); @@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); if (!pid_list->pids) { + trace_parser_put(&parser); kfree(pid_list); return -ENOMEM; } @@ -894,7 +899,7 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -void tracing_snapshot_instance(struct trace_array *tr) +void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -920,10 +925,15 @@ void tracing_snapshot_instance(struct trace_array *tr) } local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + /** * tracing_snapshot - take a snapshot of the current buffer. * @@ -946,6 +956,54 @@ void tracing_snapshot(void) } EXPORT_SYMBOL_GPL(tracing_snapshot); +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. 
+ */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_cond_snapshot_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. + * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); @@ -1025,12 +1083,111 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. 
+ */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot; + int ret = 0; + + cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + mutex_lock(&trace_types_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) + goto fail_unlock; + + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto fail_unlock; + } + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. + */ + if (tr->cond_snapshot) { + ret = -EBUSY; + goto fail_unlock; + } + + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = cond_snapshot; + arch_spin_unlock(&tr->max_lock); + + mutex_unlock(&trace_types_lock); + + return ret; + + fail_unlock: + mutex_unlock(&trace_types_lock); + kfree(cond_snapshot); + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. 
+ */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); @@ -1043,6 +1200,21 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1330,7 +1502,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use @@ -1354,12 +1526,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that 
initiated the trace. + * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) { if (tr->stop_count) return; @@ -1380,9 +1554,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) else ring_buffer_record_off(tr->max_buffer.buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) + goto out_unlock; +#endif swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); + + out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -1748,7 +1928,7 @@ static inline char *get_saved_cmdlines(int idx) static inline void set_cmdline(int idx, const char *cmdline) { - memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static int allocate_cmdlines_buffer(unsigned int val, @@ -2574,12 +2754,21 @@ trace_function(struct trace_array *tr, #ifdef CONFIG_STACKTRACE -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING 4 + +#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) + struct ftrace_stack { - unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; + unsigned long calls[FTRACE_KSTACK_ENTRIES]; +}; + + +struct ftrace_stacks { + struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct ring_buffer *buffer, @@ -2588,13 +2777,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, { struct 
trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; + unsigned int size, nr_entries; + struct ftrace_stack *fstack; struct stack_entry *entry; - struct stack_trace trace; - int use_stack; - int size = FTRACE_STACK_ENTRIES; - - trace.nr_entries = 0; - trace.skip = skip; + int stackidx; /* * Add one, for this function and the call to save_stack_trace() @@ -2602,7 +2788,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ #ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip++; + skip++; #endif /* @@ -2613,53 +2799,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = __this_cpu_inc_return(ftrace_stack_reserve); + stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + + /* This should never happen. If it does, yell once and skip */ + if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + goto out; + /* - * We don't need any atomic variables, just a barrier. - * If an interrupt comes in, we don't care, because it would - * have exited and put the counter back to what we want. - * We just need a barrier to keep gcc from moving things - * around. + * The above __this_cpu_inc_return() is 'atomic' cpu local. An + * interrupt will either see the value pre increment or post + * increment. If the interrupt happens pre increment it will have + * restored the counter when it returns. We just need a barrier to + * keep gcc from moving things around. 
*/ barrier(); - if (use_stack == 1) { - trace.entries = this_cpu_ptr(ftrace_stack.calls); - trace.max_entries = FTRACE_STACK_MAX_ENTRIES; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - - if (trace.nr_entries > size) - size = trace.nr_entries; - } else - /* From now on, use_stack is a boolean */ - use_stack = 0; + fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; + size = ARRAY_SIZE(fstack->calls); - size *= sizeof(unsigned long); + if (regs) { + nr_entries = stack_trace_save_regs(regs, fstack->calls, + size, skip); + } else { + nr_entries = stack_trace_save(fstack->calls, size, skip); + } + size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, size); - - if (use_stack) - memcpy(&entry->caller, trace.entries, - trace.nr_entries * sizeof(unsigned long)); - else { - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.entries = entry->caller; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - } - - entry->size = trace.nr_entries; + memcpy(&entry->caller, fstack->calls, size); + entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -2729,15 +2902,15 @@ void trace_dump_stack(int skip) } EXPORT_SYMBOL_GPL(trace_dump_stack); +#ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); -void +static void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - struct stack_trace trace; if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) return; @@ -2768,12 +2941,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) entry->tgid = current->tgid; 
memset(&entry->caller, 0, sizeof(entry->caller)); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = 0; - trace.entries = entry->caller; - - save_stack_trace_user(&trace); + stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -2782,13 +2950,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) out: preempt_enable(); } - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) { - ftrace_trace_userstack(tr, flags, preempt_count()); } -#endif /* UNUSED */ +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ @@ -3384,6 +3551,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file const char tgid_space[] = " "; const char space[] = " "; + print_event_info(buf, m); + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? 
tgid_space : space); seq_printf(m, "# %s / _----=> need-resched\n", @@ -3902,7 +4071,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -3912,7 +4082,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); @@ -4700,6 +4871,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -4740,8 +4912,21 @@ static const char readme_msg[] = "\t unchanged.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" - "\t already-attached hist trigger. The syntax is analagous to\n" - "\t the enable_event and disable_event triggers.\n" + "\t already-attached hist trigger. The syntax is analogous to\n" + "\t the enable_event and disable_event triggers.\n\n" + "\t Hist trigger handlers and actions are executed whenever a\n" + "\t a histogram entry is added or updated. 
They take the form:\n\n" + "\t <handler>.<action>\n\n" + "\t The available handlers are:\n\n" + "\t onmatch(matching.event) - invoke on addition or update\n" + "\t onmax(var) - invoke if var exceeds current max\n" + "\t onchange(var) - invoke action if var changes\n\n" + "\t The available actions are:\n\n" + "\t trace(<synthetic_event>,param list) - generate synthetic event\n" + "\t save(field,...) - save current event fields\n" +#ifdef CONFIG_TRACER_SNAPSHOT + "\t snapshot() - snapshot the trace buffer\n" +#endif #endif ; @@ -5386,6 +5571,16 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; +#ifdef CONFIG_TRACER_SNAPSHOT + if (t->use_max_tr) { + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + } +#endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", @@ -5624,7 +5819,6 @@ out: return ret; fail: - kfree(iter->trace); kfree(iter); __trace_array_put(tr); mutex_unlock(&trace_types_lock); @@ -5823,7 +6017,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, } static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -6468,6 +6661,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + switch (val) { case 0: if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { @@ -6493,7 +6693,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - update_max_tr(tr, current, 
smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), NULL); else update_max_tr_single(tr, current, iter->cpu_file); local_irq_enable(); @@ -6817,36 +7017,43 @@ struct buffer_ref { struct ring_buffer *buffer; void *page; int cpu; - int ref; + refcount_t refcount; }; +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - ref->ref++; + if (refcount_read(&ref->refcount) > INT_MAX/2) + return false; + + refcount_inc(&ref->refcount); + return true; } /* Pipe buffer operations for a buffer. 
*/ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -6859,11 +7066,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); spd->partial[i].private = 0; } @@ -6918,7 +7121,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - ref->ref = 1; + refcount_set(&ref->refcount, 1); ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 08900828d282..639047b259d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -194,6 +194,51 @@ struct trace_pid_list { unsigned long *pids; }; +typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); + +/** + * struct cond_snapshot - conditional snapshot data and callback + * + * The cond_snapshot structure encapsulates a callback function and + * data associated with the snapshot for a given tracing instance. + * + * When a snapshot is taken conditionally, by invoking + * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is + * passed in turn to the cond_snapshot.update() function. That data + * can be compared by the update() implementation with the cond_data + * contained within the struct cond_snapshot instance associated with + * the trace_array. Because the tr->max_lock is held throughout the + * update() call, the update() function can directly retrieve the + * cond_snapshot and cond_data associated with the per-instance + * snapshot associated with the trace_array. 
+ * + * The cond_snapshot.update() implementation can save data to be + * associated with the snapshot if it decides to, and returns 'true' + * in that case, or it returns 'false' if the conditional snapshot + * shouldn't be taken. + * + * The cond_snapshot instance is created and associated with the + * user-defined cond_data by tracing_snapshot_cond_enable(). + * Likewise, the cond_snapshot instance is destroyed and is no longer + * associated with the trace instance by + * tracing_snapshot_cond_disable(). + * + * The method below is required. + * + * @update: When a conditional snapshot is invoked, the update() + * callback function is invoked with the tr->max_lock held. The + * update() implementation signals whether or not to actually + * take the snapshot, by returning 'true' if so, 'false' if no + * snapshot should be taken. Because the max_lock is held for + * the duration of update(), the implementation is safe to + * directly retrieve and save any implementation data it needs + * to in association with the snapshot. + */ +struct cond_snapshot { + void *cond_data; + cond_update_fn_t update; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. 
@@ -277,6 +322,9 @@ struct trace_array { #endif int time_stamp_abs_ref; struct list_head hist_vars; +#ifdef CONFIG_TRACER_SNAPSHOT + struct cond_snapshot *cond_snapshot; +#endif }; enum { @@ -727,23 +775,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE -void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, - int pc); - void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_userstack(struct ring_buffer *buffer, - unsigned long flags, int pc) -{ -} - static inline void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { @@ -855,10 +896,11 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 -#define TRACE_GRAPH_PRINT_TAIL 0x80 -#define TRACE_GRAPH_SLEEP_TIME 0x100 -#define TRACE_GRAPH_GRAPH_TIME 0x200 +#define TRACE_GRAPH_PRINT_REL_TIME 0x40 +#define TRACE_GRAPH_PRINT_IRQS 0x80 +#define TRACE_GRAPH_PRINT_TAIL 0x100 +#define TRACE_GRAPH_SLEEP_TIME 0x200 +#define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1458,6 +1500,7 @@ enum regex_type { MATCH_MIDDLE_ONLY, MATCH_END_ONLY, MATCH_GLOB, + MATCH_INDEX, }; struct regex { @@ -1808,6 +1851,11 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops) extern int trace_event_enable_disable(struct trace_event_file *file, 
int enable, int soft_disable); extern int tracing_alloc_snapshot(void); +extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); + +extern int tracing_snapshot_cond_disable(struct trace_array *tr); +extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4ad967453b6f..3ea65cdff30d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant) { + unsigned long flags = user_access_save(); + /* A constant is always correct */ if (is_constant) { f->constant++; @@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, f->data.correct++; else f->data.incorrect++; + + user_access_restore(flags); } EXPORT_SYMBOL(ftrace_likely_update); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dd1f43588d70..fa100ed3b4de 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) static int create_dyn_event(int argc, char **argv) { struct dyn_event_operations *ops; - int ret; + int ret = -ENODEV; if (argv[0][0] == '-' || argv[0][0] == '!') return dyn_event_release(argc, argv, NULL); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 06bb2fd9a56c..fc8e97328e54 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -65,7 +65,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, 
__entry->parent_ip), + F_printk(" %ps <-- %ps", + (void *)__entry->ip, (void *)__entry->parent_ip), FILTER_TRACE_FN, @@ -83,7 +84,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth), + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth), FILTER_OTHER ); @@ -102,8 +103,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_desc( int, ret, depth ) ), - F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", - __entry->func, __entry->depth, + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth), @@ -167,12 +168,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -182,12 +177,13 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); @@ -201,12 +197,13 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT 
")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 76217bbef815..4629a6104474 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -299,15 +299,13 @@ int perf_uprobe_init(struct perf_event *p_event, if (!p_event->attr.uprobe_path) return -EINVAL; - path = kzalloc(PATH_MAX, GFP_KERNEL); - if (!path) - return -ENOMEM; - ret = strncpy_from_user( - path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); - if (ret == PATH_MAX) - return -E2BIG; - if (ret < 0) - goto out; + + path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), + PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + return (ret == -EINVAL) ? 
-E2BIG : ret; + } if (path[0] == '\0') { ret = -EINVAL; goto out; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27821480105e..05a66493a164 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -491,10 +491,12 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, break; case '&': case '|': + /* accepting only "&&" or "||" */ if (next[1] == next[0]) { ptr++; break; } + /* fall through */ default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); @@ -823,6 +825,9 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) *search = buff; + if (isdigit(buff[0])) + return MATCH_INDEX; + for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { @@ -860,6 +865,8 @@ static void filter_build_regex(struct filter_pred *pred) } switch (type) { + /* MATCH_INDEX should not happen, but if it does, match full */ + case MATCH_INDEX: case MATCH_FULL: r->match = regex_match_full; break; @@ -1301,7 +1308,7 @@ static int parse_pred(const char *str, void *data, /* go past the last quote */ i++; - } else if (isdigit(str[i])) { + } else if (isdigit(str[i]) || str[i] == '-') { /* Make sure the field is not a string */ if (is_string_field(field)) { @@ -1314,6 +1321,9 @@ static int parse_pred(const char *str, void *data, goto err_free; } + if (str[i] == '-') + i++; + /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 449d90cfa151..a1d20421f4b0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -313,9 +313,9 @@ struct hist_trigger_data { struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; unsigned int n_field_var_hists; - struct field_var *max_vars[SYNTH_FIELDS_MAX]; - unsigned int n_max_vars; - unsigned int n_max_var_str; + struct field_var *save_vars[SYNTH_FIELDS_MAX]; + unsigned int n_save_vars; + unsigned int n_save_var_str; }; static 
int synth_event_create(int argc, const char **argv); @@ -383,41 +383,157 @@ struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); +typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val); + +enum handler_id { + HANDLER_ONMATCH = 1, + HANDLER_ONMAX, + HANDLER_ONCHANGE, +}; + +enum action_id { + ACTION_SAVE = 1, + ACTION_TRACE, + ACTION_SNAPSHOT, +}; + struct action_data { + enum handler_id handler; + enum action_id action; + char *action_name; action_fn_t fn; + unsigned int n_params; char *params[SYNTH_FIELDS_MAX]; + /* + * When a histogram trigger is hit, the values of any + * references to variables, including variables being passed + * as parameters to synthetic events, are collected into a + * var_ref_vals array. This var_ref_idx is the index of the + * first param in the array to be passed to the synthetic + * event invocation. + */ + unsigned int var_ref_idx; + struct synth_event *synth_event; + bool use_trace_keyword; + char *synth_event_name; + union { struct { - /* - * When a histogram trigger is hit, the values of any - * references to variables, including variables being passed - * as parameters to synthetic events, are collected into a - * var_ref_vals array. This var_ref_idx is the index of the - * first param in the array to be passed to the synthetic - * event invocation. - */ - unsigned int var_ref_idx; - char *match_event; - char *match_event_system; - char *synth_event_name; - struct synth_event *synth_event; - } onmatch; + char *event; + char *event_system; + } match_data; struct { + /* + * var_str contains the $-unstripped variable + * name referenced by var_ref, and used when + * printing the action. Because var_ref + * creation is deferred to create_actions(), + * we need a per-action way to save it until + * then, thus var_str. 
+ */ char *var_str; - char *fn_name; - unsigned int max_var_ref_idx; - struct hist_field *max_var; - struct hist_field *var; - } onmax; + + /* + * var_ref refers to the variable being + * tracked e.g onmax($var). + */ + struct hist_field *var_ref; + + /* + * track_var contains the 'invisible' tracking + * variable created to keep the current + * e.g. max value. + */ + struct hist_field *track_var; + + check_track_val_fn_t check_val; + action_fn_t save_data; + } track_data; }; }; +struct track_data { + u64 track_val; + bool updated; + + unsigned int key_len; + void *key; + struct tracing_map_elt elt; + + struct action_data *action_data; + struct hist_trigger_data *hist_data; +}; + +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; +}; + +struct snapshot_context { + struct tracing_map_elt *elt; + void *key; +}; + +static void track_data_free(struct track_data *track_data) +{ + struct hist_elt_data *elt_data; + + if (!track_data) + return; + + kfree(track_data->key); + + elt_data = track_data->elt.private_data; + if (elt_data) { + kfree(elt_data->comm); + kfree(elt_data); + } + + kfree(track_data); +} + +static struct track_data *track_data_alloc(unsigned int key_len, + struct action_data *action_data, + struct hist_trigger_data *hist_data) +{ + struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct hist_elt_data *elt_data; + + if (!data) + return ERR_PTR(-ENOMEM); + + data->key = kzalloc(key_len, GFP_KERNEL); + if (!data->key) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + data->key_len = key_len; + data->action_data = action_data; + data->hist_data = hist_data; + + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + data->elt.private_data = elt_data; + + elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!elt_data->comm) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + + return data; +} static 
char last_hist_cmd[MAX_FILTER_STR_VAL]; static char hist_err_str[MAX_FILTER_STR_VAL]; @@ -1078,12 +1194,12 @@ static struct synth_event *alloc_synth_event(const char *name, int n_fields, static void action_trace(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - struct synth_event *event = data->onmatch.synth_event; + struct synth_event *event = data->synth_event; - trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); + trace_synth(event, var_ref_vals, data->var_ref_idx); } struct hist_var_data { @@ -1200,8 +1316,8 @@ static int synth_event_create(int argc, const char **argv) /* This interface accepts group name prefix */ if (strchr(name, '/')) { - len = sizeof(SYNTH_SYSTEM "/") - 1; - if (strncmp(name, SYNTH_SYSTEM "/", len)) + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) return -EINVAL; name += len; } @@ -1644,9 +1760,9 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) { - char *system = data->onmatch.match_event_system; - char *event_name = data->onmatch.match_event; + if (data->handler == HANDLER_ONMATCH) { + char *system = data->match_data.event_system; + char *event_name = data->match_data.event; file = find_var_file(tr, system, event_name, var_name); if (!file) @@ -1691,12 +1807,6 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, return hist_field; } -struct hist_elt_data { - char *comm; - u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; -}; - static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, struct ring_buffer_event *rbe, @@ -1882,7 +1992,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) return ret; if ((str_has_prefix(str, 
"onmatch(")) || - (str_has_prefix(str, "onmax("))) { + (str_has_prefix(str, "onmax(")) || + (str_has_prefix(str, "onchange("))) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -2030,7 +2141,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - memcpy(comm, task->comm, TASK_COMM_LEN); + strncpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -2076,7 +2187,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_max_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; size = STR_VAR_LEN_MAX; @@ -3050,7 +3161,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("onmatch: Too many field variables defined: ", + hist_err_event("trace action: Too many field variables defined: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3058,7 +3169,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("onmatch: Event file not found: ", + hist_err_event("trace action: Event file not found: ", subsys_name, event_name, field_name); ret = PTR_ERR(file); return ERR_PTR(ret); @@ -3072,7 +3183,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("onmatch: Matching event histogram not found: ", + hist_err_event("trace action: Matching event histogram not found: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3134,7 +3245,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't 
create histogram for field: ", + hist_err_event("trace action: Couldn't create histogram for field: ", subsys_name, event_name, field_name); return ERR_PTR(ret); } @@ -3147,7 +3258,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't find synthetic variable: ", + hist_err_event("trace action: Couldn't find synthetic variable: ", subsys_name, event_name, field_name); return ERR_PTR(-EINVAL); } @@ -3225,13 +3336,13 @@ static void update_field_vars(struct hist_trigger_data *hist_data, hist_data->n_field_vars, 0); } -static void update_max_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct ring_buffer_event *rbe, - void *rec) +static void save_track_data_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - __update_field_vars(elt, rbe, rec, hist_data->max_vars, - hist_data->n_max_vars, hist_data->n_field_var_str); + __update_field_vars(elt, rbe, rec, hist_data->save_vars, + hist_data->n_save_vars, hist_data->n_field_var_str); } static struct hist_field *create_var(struct hist_trigger_data *hist_data, @@ -3366,18 +3477,190 @@ create_target_field_var(struct hist_trigger_data *target_hist_data, return create_field_var(target_hist_data, file, var_name); } -static void onmax_print(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct action_data *data) +static bool check_track_val_max(u64 track_val, u64 var_val) +{ + if (var_val <= track_val) + return false; + + return true; +} + +static bool check_track_val_changed(u64 track_val, u64 var_val) +{ + if (var_val == track_val) + return false; + + return true; +} + +static u64 get_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + unsigned int 
track_var_idx = data->track_data.track_var->var.idx; + u64 track_val; + + track_val = tracing_map_read_var(elt, track_var_idx); + + return track_val; +} + +static void save_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data, u64 var_val) +{ + unsigned int track_var_idx = data->track_data.track_var->var.idx; + + tracing_map_set_var(elt, track_var_idx, var_val); +} + +static void save_track_data(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + if (data->track_data.save_data) + data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); +} + +static bool check_track_val(struct tracing_map_elt *elt, + struct action_data *data, + u64 var_val) +{ + struct hist_trigger_data *hist_data; + u64 track_val; + + hist_data = data->track_data.track_var->hist_data; + track_val = get_track_val(hist_data, elt, data); + + return data->track_data.check_val(track_val, var_val); +} + +#ifdef CONFIG_TRACER_SNAPSHOT +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + /* called with tr->max_lock held */ + struct track_data *track_data = tr->cond_snapshot->cond_data; + struct hist_elt_data *elt_data, *track_elt_data; + struct snapshot_context *context = cond_data; + u64 track_val; + + if (!track_data) + return false; + + track_val = get_track_val(track_data->hist_data, context->elt, + track_data->action_data); + + track_data->track_val = track_val; + memcpy(track_data->key, context->key, track_data->key_len); + + elt_data = context->elt->private_data; + track_elt_data = track_data->elt.private_data; + if (elt_data->comm) + strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + + track_data->updated = true; + + return true; +} + +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct 
ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) +{ + struct trace_event_file *file = hist_data->event_file; + struct snapshot_context context; + + context.elt = elt; + context.key = key; + + tracing_snapshot_cond(file->tr, &context); +} + +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt); + +static struct action_data *snapshot_action(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + if (!hist_data->n_actions) + return NULL; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->action == ACTION_SNAPSHOT) + return data; + } + + return NULL; +} + +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) { - unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; + struct action_data *action; - seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); + track_data = tracing_cond_snapshot_data(file->tr); + if (!track_data) + return; - for (i = 0; i < hist_data->n_max_vars; i++) { - struct hist_field *save_val = hist_data->max_vars[i]->val; - struct hist_field *save_var = hist_data->max_vars[i]->var; + if (!track_data->updated) + return; + + action = snapshot_action(hist_data); + if (!action) + return; + + seq_puts(m, "\nSnapshot taken (see tracing/snapshot). Details:\n"); + seq_printf(m, "\ttriggering value { %s(%s) }: %10llu", + action->handler == HANDLER_ONMAX ? 
"onmax" : "onchange", + action->track_data.var_str, track_data->track_val); + + seq_puts(m, "\ttriggered by event with key: "); + hist_trigger_print_key(m, hist_data, track_data->key, &track_data->elt); + seq_putc(m, '\n'); +} +#else +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + return false; +} +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) {} +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) {} +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static void track_data_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + u64 track_val = get_track_val(hist_data, elt, data); + unsigned int i, save_var_idx; + + if (data->handler == HANDLER_ONMAX) + seq_printf(m, "\n\tmax: %10llu", track_val); + else if (data->handler == HANDLER_ONCHANGE) + seq_printf(m, "\n\tchanged: %10llu", track_val); + + if (data->action == ACTION_SNAPSHOT) + return; + + for (i = 0; i < hist_data->n_save_vars; i++) { + struct hist_field *save_val = hist_data->save_vars[i]->val; + struct hist_field *save_var = hist_data->save_vars[i]->var; u64 val; save_var_idx = save_var->var.idx; @@ -3392,64 +3675,81 @@ static void onmax_print(struct seq_file *m, } } -static void onmax_save(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, - struct action_data *data, u64 *var_ref_vals) +static void ontrack_action(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - unsigned int max_idx = data->onmax.max_var->var.idx; - unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; - - u64 var_val, max_val; + u64 var_val = 
var_ref_vals[data->track_data.var_ref->var_ref_idx]; - var_val = var_ref_vals[max_var_ref_idx]; - max_val = tracing_map_read_var(elt, max_idx); - - if (var_val <= max_val) - return; - - tracing_map_set_var(elt, max_idx, var_val); - - update_max_vars(hist_data, elt, rbe, rec); + if (check_track_val(elt, data, var_val)) { + save_track_val(hist_data, elt, data, var_val); + save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + } } -static void onmax_destroy(struct action_data *data) +static void action_data_destroy(struct action_data *data) { unsigned int i; - destroy_hist_field(data->onmax.max_var, 0); - destroy_hist_field(data->onmax.var, 0); + lockdep_assert_held(&event_mutex); - kfree(data->onmax.var_str); - kfree(data->onmax.fn_name); + kfree(data->action_name); for (i = 0; i < data->n_params; i++) kfree(data->params[i]); + if (data->synth_event) + data->synth_event->ref--; + + kfree(data->synth_event_name); + kfree(data); } -static int onmax_create(struct hist_trigger_data *hist_data, - struct action_data *data) +static void track_data_destroy(struct hist_trigger_data *hist_data, + struct action_data *data) { struct trace_event_file *file = hist_data->event_file; - struct hist_field *var_field, *ref_field, *max_var; - unsigned int var_ref_idx = hist_data->n_var_refs; - struct field_var *field_var; - char *onmax_var_str, *param; - unsigned int i; + + destroy_hist_field(data->track_data.track_var, 0); + + if (data->action == ACTION_SNAPSHOT) { + struct track_data *track_data; + + track_data = tracing_cond_snapshot_data(file->tr); + if (track_data && track_data->hist_data == hist_data) { + tracing_snapshot_cond_disable(file->tr); + track_data_free(track_data); + } + } + + kfree(data->track_data.var_str); + + action_data_destroy(data); +} + +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data); + +static int track_data_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct hist_field 
*var_field, *ref_field, *track_var = NULL; + struct trace_event_file *file = hist_data->event_file; + char *track_data_var_str; int ret = 0; - onmax_var_str = data->onmax.var_str; - if (onmax_var_str[0] != '$') { - hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); + track_data_var_str = data->track_data.var_str; + if (track_data_var_str[0] != '$') { + hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); return -EINVAL; } - onmax_var_str++; + track_data_var_str++; - var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); + var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); + hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); return -EINVAL; } @@ -3457,39 +3757,26 @@ static int onmax_create(struct hist_trigger_data *hist_data, if (!ref_field) return -ENOMEM; - data->onmax.var = ref_field; + data->track_data.var_ref = ref_field; - data->fn = onmax_save; - data->onmax.max_var_ref_idx = var_ref_idx; - max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); - if (IS_ERR(max_var)) { - hist_err("onmax: Couldn't create onmax variable: ", "max"); - ret = PTR_ERR(max_var); + if (data->handler == HANDLER_ONMAX) + track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err("Couldn't create onmax variable: ", "__max"); + ret = PTR_ERR(track_var); goto out; } - data->onmax.max_var = max_var; - - for (i = 0; i < data->n_params; i++) { - param = kstrdup(data->params[i], GFP_KERNEL); - if (!param) { - ret = -ENOMEM; - goto out; - } - field_var = create_target_field_var(hist_data, NULL, NULL, param); - if (IS_ERR(field_var)) { - hist_err("onmax: Couldn't create field variable: ", param); - ret = PTR_ERR(field_var); - kfree(param); - goto out; - } - - hist_data->max_vars[hist_data->n_max_vars++] = field_var; - if 
(field_var->val->flags & HIST_FIELD_FL_STRING) - hist_data->n_max_var_str++; - - kfree(param); + if (data->handler == HANDLER_ONCHANGE) + track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err("Couldn't create onchange variable: ", "__change"); + ret = PTR_ERR(track_var); + goto out; } + data->track_data.track_var = track_var; + + ret = action_create(hist_data, data); out: return ret; } @@ -3497,14 +3784,18 @@ static int onmax_create(struct hist_trigger_data *hist_data, static int parse_action_params(char *params, struct action_data *data) { char *param, *saved_param; + bool first_param = true; int ret = 0; while (params) { - if (data->n_params >= SYNTH_FIELDS_MAX) + if (data->n_params >= SYNTH_FIELDS_MAX) { + hist_err("Too many action params", ""); goto out; + } param = strsep(&params, ","); if (!param) { + hist_err("No action param found", ""); ret = -EINVAL; goto out; } @@ -3522,86 +3813,164 @@ static int parse_action_params(char *params, struct action_data *data) goto out; } + if (first_param && data->use_trace_keyword) { + data->synth_event_name = saved_param; + first_param = false; + continue; + } + first_param = false; + data->params[data->n_params++] = saved_param; } out: return ret; } -static struct action_data *onmax_parse(char *str) +static int action_parse(char *str, struct action_data *data, + enum handler_id handler) { - char *onmax_fn_name, *onmax_var_str; - struct action_data *data; - int ret = -EINVAL; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return ERR_PTR(-ENOMEM); + char *action_name; + int ret = 0; - onmax_var_str = strsep(&str, ")"); - if (!onmax_var_str || !str) { + strsep(&str, "."); + if (!str) { + hist_err("action parsing: No action found", ""); ret = -EINVAL; - goto free; + goto out; } - data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); - if (!data->onmax.var_str) { - ret = -ENOMEM; - goto free; + action_name = strsep(&str, "("); + if (!action_name || !str) { 
+ hist_err("action parsing: No action found", ""); + ret = -EINVAL; + goto out; } - strsep(&str, "."); - if (!str) - goto free; - - onmax_fn_name = strsep(&str, "("); - if (!onmax_fn_name || !str) - goto free; - - if (str_has_prefix(onmax_fn_name, "save")) { + if (str_has_prefix(action_name, "save")) { char *params = strsep(&str, ")"); if (!params) { + hist_err("action parsing: No params found for %s", "save"); ret = -EINVAL; - goto free; + goto out; } ret = parse_action_params(params, data); if (ret) - goto free; - } else + goto out; + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err("action parsing: Handler doesn't support action: ", action_name); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_vars; + data->fn = ontrack_action; + data->action = ACTION_SAVE; + } else if (str_has_prefix(action_name, "snapshot")) { + char *params = strsep(&str, ")"); + + if (!str) { + hist_err("action parsing: No closing paren found: %s", params); + ret = -EINVAL; + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err("action parsing: Handler doesn't support action: ", action_name); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_snapshot; + data->fn = ontrack_action; + data->action = ACTION_SNAPSHOT; + } else { + char *params = strsep(&str, ")"); + + if (str_has_prefix(action_name, "trace")) + data->use_trace_keyword = true; + + if (params) { + ret = parse_action_params(params, data); + if (ret) + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + + if (handler 
!= HANDLER_ONMATCH) { + data->track_data.save_data = action_trace; + data->fn = ontrack_action; + } else + data->fn = action_trace; + + data->action = ACTION_TRACE; + } + + data->action_name = kstrdup(action_name, GFP_KERNEL); + if (!data->action_name) { + ret = -ENOMEM; + goto out; + } + + data->handler = handler; + out: + return ret; +} + +static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, + char *str, enum handler_id handler) +{ + struct action_data *data; + int ret = -EINVAL; + char *var_str; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + var_str = strsep(&str, ")"); + if (!var_str || !str) { + ret = -EINVAL; goto free; + } - data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); - if (!data->onmax.fn_name) { + data->track_data.var_str = kstrdup(var_str, GFP_KERNEL); + if (!data->track_data.var_str) { ret = -ENOMEM; goto free; } + + ret = action_parse(str, data, handler); + if (ret) + goto free; out: return data; free: - onmax_destroy(data); + track_data_destroy(hist_data, data); data = ERR_PTR(ret); goto out; } static void onmatch_destroy(struct action_data *data) { - unsigned int i; - - lockdep_assert_held(&event_mutex); + kfree(data->match_data.event); + kfree(data->match_data.event_system); - kfree(data->onmatch.match_event); - kfree(data->onmatch.match_event_system); - kfree(data->onmatch.synth_event_name); - - for (i = 0; i < data->n_params; i++) - kfree(data->params[i]); - - if (data->onmatch.synth_event) - data->onmatch.synth_event->ref--; - - kfree(data); + action_data_destroy(data); } static void destroy_field_var(struct field_var *field_var) @@ -3651,8 +4020,9 @@ static int check_synth_field(struct synth_event *event, } static struct hist_field * -onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, - char *system, char *event, char *var) +trace_action_find_var(struct hist_trigger_data *hist_data, + struct action_data *data, + char *system, char 
*event, char *var) { struct hist_field *hist_field; @@ -3660,24 +4030,24 @@ onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, hist_field = find_target_event_var(hist_data, system, event, var); if (!hist_field) { - if (!system) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; } hist_field = find_event_var(hist_data, system, event, var); } if (!hist_field) - hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); + hist_err_event("trace action: Couldn't find param: $", system, event, var); return hist_field; } static struct hist_field * -onmatch_create_field_var(struct hist_trigger_data *hist_data, - struct action_data *data, char *system, - char *event, char *var) +trace_action_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) { struct hist_field *hist_field = NULL; struct field_var *field_var; @@ -3700,9 +4070,9 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, * looking for fields on the onmatch(system.event.xxx) * event. 
*/ - if (!system) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; } /* @@ -3724,24 +4094,30 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, goto out; } -static int onmatch_create(struct hist_trigger_data *hist_data, - struct trace_event_file *file, - struct action_data *data) +static int trace_action_create(struct hist_trigger_data *hist_data, + struct action_data *data) { char *event_name, *param, *system = NULL; struct hist_field *hist_field, *var_ref; unsigned int i, var_ref_idx; unsigned int field_pos = 0; struct synth_event *event; + char *synth_event_name; int ret = 0; lockdep_assert_held(&event_mutex); - event = find_synth_event(data->onmatch.synth_event_name); + if (data->use_trace_keyword) + synth_event_name = data->synth_event_name; + else + synth_event_name = data->action_name; + + event = find_synth_event(synth_event_name); if (!event) { - hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); + hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); return -EINVAL; } + event->ref++; var_ref_idx = hist_data->n_var_refs; @@ -3769,13 +4145,15 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (param[0] == '$') - hist_field = onmatch_find_var(hist_data, data, system, - event_name, param); + hist_field = trace_action_find_var(hist_data, data, + system, event_name, + param); else - hist_field = onmatch_create_field_var(hist_data, data, - system, - event_name, - param); + hist_field = trace_action_create_field_var(hist_data, + data, + system, + event_name, + param); if (!hist_field) { kfree(p); @@ -3797,7 +4175,7 @@ static int onmatch_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", + hist_err_event("trace action: Param 
type doesn't match synthetic event field type: ", system, event_name, param); kfree(p); ret = -EINVAL; @@ -3805,14 +4183,13 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (field_pos != event->n_fields) { - hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); + hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); ret = -EINVAL; goto err; } - data->fn = action_trace; - data->onmatch.synth_event = event; - data->onmatch.var_ref_idx = var_ref_idx; + data->synth_event = event; + data->var_ref_idx = var_ref_idx; out: return ret; err: @@ -3821,10 +4198,75 @@ static int onmatch_create(struct hist_trigger_data *hist_data, goto out; } +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; + struct field_var *field_var; + unsigned int i; + char *param; + int ret = 0; + + if (data->action == ACTION_TRACE) + return trace_action_create(hist_data, data); + + if (data->action == ACTION_SNAPSHOT) { + track_data = track_data_alloc(hist_data->key_size, data, hist_data); + if (IS_ERR(track_data)) { + ret = PTR_ERR(track_data); + goto out; + } + + ret = tracing_snapshot_cond_enable(file->tr, track_data, + cond_snapshot_update); + if (ret) + track_data_free(track_data); + + goto out; + } + + if (data->action == ACTION_SAVE) { + if (hist_data->n_save_vars) { + ret = -EEXIST; + hist_err("save action: Can't have more than one save() action per hist", ""); + goto out; + } + + for (i = 0; i < data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = create_target_field_var(hist_data, NULL, NULL, param); + if (IS_ERR(field_var)) { + hist_err("save action: Couldn't create field variable: ", param); + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + 
hist_data->save_vars[hist_data->n_save_vars++] = field_var; + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_save_var_str++; + kfree(param); + } + } + out: + return ret; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + return action_create(hist_data, data); +} + static struct action_data *onmatch_parse(struct trace_array *tr, char *str) { char *match_event, *match_event_system; - char *synth_event_name, *params; struct action_data *data; int ret = -EINVAL; @@ -3850,43 +4292,19 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) goto free; } - data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); - if (!data->onmatch.match_event) { - ret = -ENOMEM; - goto free; - } - - data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); - if (!data->onmatch.match_event_system) { + data->match_data.event = kstrdup(match_event, GFP_KERNEL); + if (!data->match_data.event) { ret = -ENOMEM; goto free; } - strsep(&str, "."); - if (!str) { - hist_err("onmatch: Missing . 
after onmatch(): ", str); - goto free; - } - - synth_event_name = strsep(&str, "("); - if (!synth_event_name || !str) { - hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); - goto free; - } - - data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); - if (!data->onmatch.synth_event_name) { + data->match_data.event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->match_data.event_system) { ret = -ENOMEM; goto free; } - params = strsep(&str, ")"); - if (!params || !str || (str && strlen(str))) { - hist_err("onmatch: Missing closing paramlist paren: ", params); - goto free; - } - - ret = parse_action_params(params, data); + ret = action_parse(str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -4326,10 +4744,11 @@ static void destroy_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) onmatch_destroy(data); - else if (data->fn == onmax_save) - onmax_destroy(data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_destroy(hist_data, data); else kfree(data); } @@ -4355,16 +4774,24 @@ static int parse_actions(struct hist_trigger_data *hist_data) ret = PTR_ERR(data); break; } - data->fn = action_trace; } else if ((len = str_has_prefix(str, "onmax("))) { char *action_str = str + len; - data = onmax_parse(action_str); + data = track_data_parse(hist_data, action_str, + HANDLER_ONMAX); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + break; + } + } else if ((len = str_has_prefix(str, "onchange("))) { + char *action_str = str + len; + + data = track_data_parse(hist_data, action_str, + HANDLER_ONCHANGE); if (IS_ERR(data)) { ret = PTR_ERR(data); break; } - data->fn = onmax_save; } else { ret = -EINVAL; break; @@ -4376,8 +4803,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) return ret; } -static int 
create_actions(struct hist_trigger_data *hist_data, - struct trace_event_file *file) +static int create_actions(struct hist_trigger_data *hist_data) { struct action_data *data; unsigned int i; @@ -4386,14 +4812,18 @@ static int create_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->attrs->n_actions; i++) { data = hist_data->actions[i]; - if (data->fn == action_trace) { - ret = onmatch_create(hist_data, file, data); + if (data->handler == HANDLER_ONMATCH) { + ret = onmatch_create(hist_data, data); if (ret) - return ret; - } else if (data->fn == onmax_save) { - ret = onmax_create(hist_data, data); + break; + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + ret = track_data_create(hist_data, data); if (ret) - return ret; + break; + } else { + ret = -EINVAL; + break; } } @@ -4409,26 +4839,51 @@ static void print_actions(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == onmax_save) - onmax_print(m, hist_data, elt, data); + if (data->action == ACTION_SNAPSHOT) + continue; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_print(m, hist_data, elt, data); } } -static void print_onmax_spec(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct action_data *data) +static void print_action_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) { unsigned int i; - seq_puts(m, ":onmax("); - seq_printf(m, "%s", data->onmax.var_str); - seq_printf(m, ").%s(", data->onmax.fn_name); - - for (i = 0; i < hist_data->n_max_vars; i++) { - seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); - if (i < hist_data->n_max_vars - 1) - seq_puts(m, ","); + if (data->action == ACTION_SAVE) { + for (i = 0; i < hist_data->n_save_vars; i++) { + seq_printf(m, "%s", hist_data->save_vars[i]->var->var.name); + if (i < hist_data->n_save_vars - 1) + seq_puts(m, ","); 
+ } + } else if (data->action == ACTION_TRACE) { + if (data->use_trace_keyword) + seq_printf(m, "%s", data->synth_event_name); + for (i = 0; i < data->n_params; i++) { + if (i || data->use_trace_keyword) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } } +} + +static void print_track_data_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + if (data->handler == HANDLER_ONMAX) + seq_puts(m, ":onmax("); + else if (data->handler == HANDLER_ONCHANGE) + seq_puts(m, ":onchange("); + seq_printf(m, "%s", data->track_data.var_str); + seq_printf(m, ").%s(", data->action_name); + + print_action_spec(m, hist_data, data); + seq_puts(m, ")"); } @@ -4436,18 +4891,12 @@ static void print_onmatch_spec(struct seq_file *m, struct hist_trigger_data *hist_data, struct action_data *data) { - unsigned int i; + seq_printf(m, ":onmatch(%s.%s).", data->match_data.event_system, + data->match_data.event); - seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, - data->onmatch.match_event); + seq_printf(m, "%s(", data->action_name); - seq_printf(m, "%s(", data->onmatch.synth_event->name); - - for (i = 0; i < data->n_params; i++) { - if (i) - seq_puts(m, ","); - seq_printf(m, "%s", data->params[i]); - } + print_action_spec(m, hist_data, data); seq_puts(m, ")"); } @@ -4463,8 +4912,11 @@ static bool actions_match(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; struct action_data *data_test = hist_data_test->actions[i]; + char *action_name, *action_name_test; - if (data->fn != data_test->fn) + if (data->handler != data_test->handler) + return false; + if (data->action != data_test->action) return false; if (data->n_params != data_test->n_params) @@ -4475,22 +4927,30 @@ static bool actions_match(struct hist_trigger_data *hist_data, return false; } - if (data->fn == action_trace) { - if (strcmp(data->onmatch.synth_event_name, - 
data_test->onmatch.synth_event_name) != 0) - return false; - if (strcmp(data->onmatch.match_event_system, - data_test->onmatch.match_event_system) != 0) - return false; - if (strcmp(data->onmatch.match_event, - data_test->onmatch.match_event) != 0) + if (data->use_trace_keyword) + action_name = data->synth_event_name; + else + action_name = data->action_name; + + if (data_test->use_trace_keyword) + action_name_test = data_test->synth_event_name; + else + action_name_test = data_test->action_name; + + if (strcmp(action_name, action_name_test) != 0) + return false; + + if (data->handler == HANDLER_ONMATCH) { + if (strcmp(data->match_data.event_system, + data_test->match_data.event_system) != 0) return false; - } else if (data->fn == onmax_save) { - if (strcmp(data->onmax.var_str, - data_test->onmax.var_str) != 0) + if (strcmp(data->match_data.event, + data_test->match_data.event) != 0) return false; - if (strcmp(data->onmax.fn_name, - data_test->onmax.fn_name) != 0) + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + if (strcmp(data->track_data.var_str, + data_test->track_data.var_str) != 0) return false; } } @@ -4507,10 +4967,11 @@ static void print_actions_spec(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) print_onmatch_spec(m, hist_data, data); - else if (data->fn == onmax_save) - print_onmax_spec(m, hist_data, data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + print_track_data_spec(m, hist_data, data); } } @@ -4695,22 +5156,24 @@ static inline void add_to_key(char *compound_key, void *key, /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - } - memcpy(compound_key + key_field->offset, key, size); + strncpy(compound_key + key_field->offset, (char *)key, size); + } else + memcpy(compound_key + key_field->offset, 
key, size); } static void hist_trigger_actions(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, u64 *var_ref_vals) + struct ring_buffer_event *rbe, void *key, + u64 *var_ref_vals) { struct action_data *data; unsigned int i; for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); + data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); } } @@ -4723,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; - struct stack_trace stacktrace; struct hist_field *key_field; u64 field_contents; void *key = NULL; @@ -4735,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { - stacktrace.max_entries = HIST_STACKTRACE_DEPTH; - stacktrace.entries = entries; - stacktrace.nr_entries = 0; - stacktrace.skip = HIST_STACKTRACE_SKIP; - - memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); - save_stack_trace(&stacktrace); - + memset(entries, 0, HIST_STACKTRACE_SIZE); + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); key = entries; } else { field_contents = key_field->fn(key_field, elt, rbe, rec); @@ -4771,7 +5228,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -4783,7 +5240,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, unsigned int i; for (i = 0; i < max_entries; i++) { - if 
(stacktrace_entries[i] == ULONG_MAX) + if (!stacktrace_entries[i]) return; seq_printf(m, "%*c", 1 + spaces, ' '); @@ -4792,10 +5249,10 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, } } -static void -hist_trigger_entry_print(struct seq_file *m, - struct hist_trigger_data *hist_data, void *key, - struct tracing_map_elt *elt) +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) { struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; @@ -4871,6 +5328,17 @@ hist_trigger_entry_print(struct seq_file *m, seq_puts(m, " "); seq_puts(m, "}"); +} + +static void hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) +{ + const char *field_name; + unsigned int i; + + hist_trigger_print_key(m, hist_data, key, elt); seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); @@ -4937,6 +5405,8 @@ static void hist_trigger_show(struct seq_file *m, if (n_entries < 0) n_entries = 0; + track_data_snapshot_print(m, hist_data); + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), n_entries, (u64)atomic64_read(&hist_data->map->drops)); @@ -5683,7 +6153,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (has_hist_vars(hist_data)) save_hist_vars(hist_data); - ret = create_actions(hist_data, file); + ret = create_actions(hist_data); if (ret) goto out_unreg; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c2af1560e856..69ebf3c2f1b5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -380,6 +380,7 @@ static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { trace_seq_putc(s, ' '); trace_print_lat_fmt(s, entry); + trace_seq_puts(s, " | "); } /* If the pid changed since the last trace, output this event 
*/ @@ -501,6 +502,17 @@ static void print_graph_abs_time(u64 t, struct trace_seq *s) } static void +print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s) +{ + unsigned long long usecs; + + usecs = iter->ts - iter->trace_buffer->time_start; + do_div(usecs, NSEC_PER_USEC); + + trace_seq_printf(s, "%9llu us | ", usecs); +} + +static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { @@ -517,6 +529,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -725,6 +741,10 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -1101,6 +1121,8 @@ static void print_lat_header(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_ABS_TIME) size += 16; + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + size += 16; if (flags & TRACE_GRAPH_PRINT_CPU) size += 4; if (flags & TRACE_GRAPH_PRINT_PROC) @@ -1125,12 +1147,14 @@ static void __print_graph_headers_flags(struct trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " TIME "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " REL TIME "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " CPU"); if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " TASK/PID "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " DURATION "); seq_puts(s, " FUNCTION CALLS\n"); @@ -1139,12 +1163,14 @@ static void __print_graph_headers_flags(struct 
trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " | | "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " | | "); seq_puts(s, " | | | |\n"); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d3294721f119..a745b0cee5d3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include "trace.h" @@ -238,7 +239,7 @@ static void irqsoff_trace_close(struct trace_iterator *iter) #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_REL_TIME | \ TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) @@ -365,7 +366,7 @@ out: __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } -static inline void +static nokprobe_inline void start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) atomic_dec(&data->disabled); } -static inline void +static nokprobe_inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -443,6 +444,7 @@ void start_critical_timings(void) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); +NOKPROBE_SYMBOL(start_critical_timings); void stop_critical_timings(void) { @@ -452,6 +454,7 @@ void stop_critical_timings(void) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(stop_critical_timings); +NOKPROBE_SYMBOL(stop_critical_timings); #ifdef CONFIG_FUNCTION_TRACER 
static bool function_enabled; @@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) stop_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_on); void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { @@ -619,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) start_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_off); static int irqsoff_tracer_init(struct trace_array *tr) { diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d953c163a079..810d78a8d14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d5fb09ebba8b..5d5129b05df7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ static struct dyn_event_operations trace_kprobe_ops = { .match = trace_kprobe_match, }; -/** +/* * Kprobe event core functions */ struct trace_kprobe { @@ -221,7 +221,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.maxactive = maxactive; - if (!event || !is_good_name(event)) { + if (!event || !group) { ret = -EINVAL; goto error; } @@ -231,11 +231,6 @@ static struct trace_kprobe 
*alloc_trace_kprobe(const char *group, if (!tk->tp.call.name) goto error; - if (!group || !is_good_name(group)) { - ret = -EINVAL; - goto error; - } - tk->tp.class.system = kstrdup(group, GFP_KERNEL); if (!tk->tp.class.system) goto error; @@ -624,7 +619,11 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (event) event++; - if (is_return && isdigit(argv[0][1])) { + if (isdigit(argv[0][1])) { + if (!is_return) { + pr_info("Maxactive is not for kprobe"); + return -EINVAL; + } if (event) len = event - &argv[0][1] - 1; else @@ -634,8 +633,8 @@ static int trace_kprobe_create(int argc, const char *argv[]) memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); - if (ret) { - pr_info("Failed to parse maxactive.\n"); + if (ret || !maxactive) { + pr_info("Invalid maxactive number\n"); return ret; } /* kretprobes instances are iterated over via a list. The @@ -694,9 +693,9 @@ static int trace_kprobe_create(int argc, const char *argv[]) tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); if (IS_ERR(tk)) { - pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tk)); ret = PTR_ERR(tk); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto out; } @@ -861,22 +860,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - mm_segment_t old_fs; int ret, len = 0; u8 c; - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - pagefault_enable(); - set_fs(old_fs); - return (ret < 0) ? 
ret : len; } diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 71f553cceb3c..4d8e99fdbbbe 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -9,6 +9,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include "trace.h" #define CREATE_TRACE_POINTS @@ -30,6 +31,7 @@ void trace_hardirqs_on(void) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); +NOKPROBE_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { @@ -43,6 +45,7 @@ void trace_hardirqs_off(void) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); +NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { @@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); +NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { @@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); +NOKPROBE_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9962cb5da8ac..8f8411e7835f 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,7 +13,7 @@ #include "trace_probe.h" -const char *reserved_field_names[] = { +static const char *reserved_field_names[] = { "common_type", "common_flags", "common_preempt_count", @@ -159,6 +159,7 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf) { const char *slash, *event = *pevent; + int len; slash = strchr(event, '/'); if (slash) { @@ -171,12 +172,25 @@ int traceprobe_parse_event_name(const char **pevent, const char 
**pgroup, return -E2BIG; } strlcpy(buf, event, slash - event + 1); + if (!is_good_name(buf)) { + pr_info("Group name must follow the same rules as C identifiers\n"); + return -EINVAL; + } *pgroup = buf; *pevent = slash + 1; + event = *pevent; } - if (strlen(event) == 0) { + len = strlen(event); + if (len == 0) { pr_info("Event name is not specified\n"); return -EINVAL; + } else if (len > MAX_EVENT_NAME_LEN) { + pr_info("Event name is too long\n"); + return -E2BIG; + } + if (!is_good_name(event)) { + pr_info("Event name must follow the same rules as C identifiers\n"); + return -EINVAL; } return 0; } @@ -300,6 +314,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, case '+': /* deref memory */ arg++; /* Skip '+', because kstrtol() rejects it. */ + /* fall through */ case '-': tmp = strchr(arg, '('); if (!tmp) @@ -547,6 +562,8 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, body = strchr(arg, '='); if (body) { + if (body - arg > MAX_ARG_NAME_LEN || body == arg) + return -EINVAL; parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8a63f8bc01bc..2177c206de15 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -32,6 +32,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 +#define MAX_ARG_NAME_LEN 32 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 5c56afc17cf8..4737bb8c07a3 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -180,10 +180,12 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, regs, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) + if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - 
base); - else + } else { dyndata += ret; + maxlen -= ret; + } } } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4ea7e6845efb..743b2b520d34 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -180,8 +180,11 @@ static void wakeup_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ - TRACE_GRAPH_PRINT_DURATION) + TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_REL_TIME | \ + TRACE_GRAPH_PRINT_DURATION | \ + TRACE_GRAPH_PRINT_OVERHEAD | \ + TRACE_GRAPH_PRINT_IRQS) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { @@ -472,6 +475,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); @@ -482,7 +486,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, if (likely(!is_tracing_stopped())) { wakeup_trace->max_latency = delta; - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu, NULL); } out_unlock: @@ -583,6 +587,7 @@ probe_wakeup(void *ignore, struct task_struct *p) data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); /* * We must be careful in using CALLER_ADDR2. But since wake_up diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index eec648a0d673..5d16f73898db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,44 +18,32 @@ #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = - { [0 ... 
(STACK_TRACE_ENTRIES)] = ULONG_MAX }; -unsigned stack_trace_index[STACK_TRACE_ENTRIES]; +#define STACK_TRACE_ENTRIES 500 -/* - * Reserve one entry for the passed in ip. This will allow - * us to remove most or all of the stack size overhead - * added by the stack tracer itself. - */ -struct stack_trace stack_trace_max = { - .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[0], -}; +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; +static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -unsigned long stack_trace_max_size; -arch_spinlock_t stack_trace_max_lock = +static unsigned int stack_trace_nr_entries; +static unsigned long stack_trace_max_size; +static arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; -static int last_stack_tracer_enabled; -void stack_trace_print(void) +static void print_max_stack(void) { long i; int size; pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); - for (i = 0; i < stack_trace_max.nr_entries; i++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + for (i = 0; i < stack_trace_nr_entries; i++) { + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -65,16 +53,7 @@ void stack_trace_print(void) } } -/* - * When arch-specific code overrides this function, the following - * data should be filled up, assuming stack_trace_max_lock is held to - * prevent concurrent updates. 
- * stack_trace_index[] - * stack_trace_max - * stack_trace_max_size - */ -void __weak -check_stack(unsigned long ip, unsigned long *stack) +static void check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; @@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; - stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 0; - - save_stack_trace(&stack_trace_max); + stack_trace_nr_entries = stack_trace_save(stack_dump_trace, + ARRAY_SIZE(stack_dump_trace) - 1, + 0); /* Skip over the overhead of the stack tracer itself */ - for (i = 0; i < stack_trace_max.nr_entries; i++) { + for (i = 0; i < stack_trace_nr_entries; i++) { if (stack_dump_trace[i] == ip) break; } @@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack) * Some archs may not have the passed in ip in the dump. * If that happens, we need to show everything. */ - if (i == stack_trace_max.nr_entries) + if (i == stack_trace_nr_entries) i = 0; /* @@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack) * loop will only happen once. This code only takes place * on a new max, so it is far from a fast path. */ - while (i < stack_trace_max.nr_entries) { + while (i < stack_trace_nr_entries) { int found = 0; stack_trace_index[x] = this_size; p = start; - for (; p < top && i < stack_trace_max.nr_entries; p++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; + for (; p < top && i < stack_trace_nr_entries; p++) { /* * The READ_ONCE_NOCHECK is used to let KASAN know that * this is not a stack-out-of-bounds error. 
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - stack_trace_max.nr_entries = x; - for (; x < i; x++) - stack_dump_trace[x] = ULONG_MAX; + stack_trace_nr_entries = x; if (task_stack_end_corrupted(current)) { - stack_trace_print(); + print_max_stack(); BUG(); } @@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n >= stack_trace_nr_entries) return NULL; m->private = (void *)n; @@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); if (!stack_tracer_enabled && !stack_trace_max_size) print_disabled(m); @@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= stack_trace_max.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (i >= stack_trace_nr_entries) return 0; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int was_enabled; int ret; mutex_lock(&stack_sysctl_mutex); + was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write || - (last_stack_tracer_enabled == !!stack_tracer_enabled)) + if (ret || !write || (was_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = !!stack_tracer_enabled; - if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: mutex_unlock(&stack_sysctl_mutex); return ret; @@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str) strncpy(stack_trace_filter_buf, str + len, 
COMMAND_LINE_SIZE); stack_tracer_enabled = 1; - last_stack_tracer_enabled = 1; return 1; } __setup("stacktrace", enable_stacktrace); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f93a56d2db27..fa8fbff736d6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct ring_buffer_event *event; struct ring_buffer *buffer; unsigned long irq_flags; + unsigned long args[6]; int pc; int syscall_nr; int size; @@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + syscall_get_arguments(current, regs, args); + memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); event_trigger_unlock_commit(trace_file, buffer, event, entry, irq_flags, pc); @@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; + unsigned long args[6]; bool valid_prog_array; int syscall_nr; int rctx; @@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); + syscall_get_arguments(current, regs, args); + memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9bde07c06362..be78d99ee6bc 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -273,10 +273,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; - if 
(!event || !is_good_name(event)) - return ERR_PTR(-EINVAL); - - if (!group || !is_good_name(group)) + if (!event || !group) return ERR_PTR(-EINVAL); tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); @@ -524,8 +521,9 @@ static int trace_uprobe_create(int argc, const char **argv) tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { - pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto fail_address_parse; } tu->offset = offset; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 977918d5d350..7f9e7b9306fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -int __read_mostly nmi_watchdog_available; +static int __read_mostly nmi_watchdog_available; -struct cpumask watchdog_allowed_mask __read_mostly; +static struct cpumask watchdog_allowed_mask __read_mostly; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -199,6 +199,13 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); +static int __init watchdog_thresh_setup(char *str) +{ + get_option(&str, &watchdog_thresh); + return 1; +} +__setup("watchdog_thresh=", watchdog_thresh_setup); + #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; @@ -547,13 +554,15 @@ static void softlockup_start_all(void) int lockup_detector_online_cpu(unsigned int cpu) { - watchdog_enable(cpu); + if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) + watchdog_enable(cpu); return 0; } int lockup_detector_offline_cpu(unsigned int cpu) { - watchdog_disable(cpu); + if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) + 
watchdog_disable(cpu); return 0; } @@ -581,7 +590,7 @@ static void lockup_detector_reconfigure(void) * Create the watchdog thread infrastructure and configure the detector(s). * * The threads are not unparked as watchdog_allowed_mask is empty. When - * the threads are sucessfully initialized, take the proper locks and + * the threads are successfully initialized, take the proper locks and * unpark the threads in the watchdog_cpumask if the watchdog is enabled. */ static __init void lockup_detector_setup(void) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71381168dede..247bf0b1582c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", + this_cpu); print_modules(); print_irqtrace_events(current); if (regs) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fc5d23d752a5..faf7622246da 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -259,6 +259,8 @@ struct workqueue_struct { struct wq_device *wq_dev; /* I: for sysfs interface */ #endif #ifdef CONFIG_LOCKDEP + char *lock_name; + struct lock_class_key key; struct lockdep_map lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ @@ -646,7 +648,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible - * reordering can lead to a missed execution on attempt to qeueue + * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. 
consider this case: * * CPU#0 CPU#1 @@ -839,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool) } /** - * wq_worker_waking_up - a worker is waking up + * wq_worker_running - a worker is running again * @task: task waking up - * @cpu: CPU @task is waking up to * - * This function is called during try_to_wake_up() when a worker is - * being awoken. - * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->cpu != cpu); + if (!worker->sleeping) + return; + if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); - } + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * Return: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct worker *next, *worker = kthread_data(task); struct worker_pool *pool; /* @@ -884,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * checking NOT_RUNNING. 
*/ if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; pool = worker->pool; - /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) - return NULL; + if (WARN_ON_ONCE(worker->sleeping)) + return; + + worker->sleeping = 1; + spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -904,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - to_wakeup = first_idle_worker(pool); - return to_wakeup ? to_wakeup->task : NULL; + !list_empty(&pool->worklist)) { + next = first_idle_worker(pool); + if (next) + wake_up_process(next->task); + } + spin_unlock_irq(&pool->lock); } /** @@ -918,6 +914,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * CONTEXT: * spin_lock_irq(rq->lock) * + * This function is called during schedule() when a kworker is going + * to sleep. It's used by psi to identify aggregation workers during + * dequeuing, to allow periodic aggregation to shut-off when that + * worker is the last task in the system or cgroup to go to sleep. + * + * As this function doesn't involve any workqueue-related locking, it + * only returns stable values when called from inside the scheduler's + * queuing and dequeuing paths, when @task, which must be a kworker, + * is guaranteed to not be processing any works. + * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. @@ -1341,7 +1347,7 @@ static bool is_chained_work(struct workqueue_struct *wq) worker = current_wq_worker(); /* - * Return %true iff I'm a worker execuing a work item on @wq. If + * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. 
*/ return worker && worker->current_pwq->wq == wq; @@ -1512,6 +1518,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); +/** + * workqueue_select_cpu_near - Select a CPU based on NUMA node + * @node: NUMA node ID that we want to select a CPU from + * + * This function will attempt to find a "random" cpu available on a given + * node. If there are no CPUs available on the given node it will return + * WORK_CPU_UNBOUND indicating that we should just schedule to any + * available CPU if we need to schedule this work. + */ +static int workqueue_select_cpu_near(int node) +{ + int cpu; + + /* No point in doing this if NUMA isn't enabled for workqueues */ + if (!wq_numa_enabled) + return WORK_CPU_UNBOUND; + + /* Delay binding to CPU if node is not valid or online */ + if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) + return WORK_CPU_UNBOUND; + + /* Use local node/cpu if we are already there */ + cpu = raw_smp_processor_id(); + if (node == cpu_to_node(cpu)) + return cpu; + + /* Use "random" otherwise know as "first" online CPU of node */ + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); + + /* If CPU is valid return that, otherwise just defer */ + return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; +} + +/** + * queue_work_node - queue work on a "random" cpu for a given NUMA node + * @node: NUMA node that we are targeting the work for + * @wq: workqueue to use + * @work: work to queue + * + * We queue the work to a "random" CPU within a given NUMA node. The basic + * idea here is to provide a way to somehow associate work with a given + * NUMA node. + * + * This function will only make a best effort attempt at getting this onto + * the right NUMA node. If no node is requested or the requested node is + * offline then we just fall back to standard queue_work behavior. 
+ * + * Currently the "random" CPU ends up being the first available CPU in the + * intersection of cpu_online_mask and the cpumask of the node, unless we + * are running on the node. In that case we just use the current CPU. + * + * Return: %false if @work was already on a queue, %true otherwise. + */ +bool queue_work_node(int node, struct workqueue_struct *wq, + struct work_struct *work) +{ + unsigned long flags; + bool ret = false; + + /* + * This current implementation is specific to unbound workqueues. + * Specifically we only return the first available CPU for a given + * node instead of cycling through individual CPUs within the node. + * + * If this is used with a per-cpu workqueue then the logic in + * workqueue_select_cpu_near would need to be updated to allow for + * some round robin type logic. + */ + WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); + + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + int cpu = workqueue_select_cpu_near(node); + + __queue_work(cpu, wq, work); + ret = true; + } + + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_node); + void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = from_timer(dwork, t, timer); @@ -1639,7 +1729,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu) * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. - * While @rwork is guarnateed to be executed after a %false return, the + * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. 
*/ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) @@ -2181,7 +2271,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2500,11 +2590,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -2931,6 +3021,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!wq_online)) return false; + if (WARN_ON(!work->func)) + return false; + if (!from_cancel) { lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); @@ -3337,11 +3430,51 @@ static int init_worker_pool(struct worker_pool *pool) return 0; } +#ifdef CONFIG_LOCKDEP +static void wq_init_lockdep(struct workqueue_struct *wq) +{ + char *lock_name; + + lockdep_register_key(&wq->key); + lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); + if (!lock_name) + lock_name = wq->name; + + wq->lock_name = lock_name; + lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ + lockdep_unregister_key(&wq->key); +} + +static void wq_free_lockdep(struct workqueue_struct 
*wq) +{ + if (wq->lock_name != wq->name) + kfree(wq->lock_name); +} +#else +static void wq_init_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_free_lockdep(struct workqueue_struct *wq) +{ +} +#endif + static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); + wq_free_lockdep(wq); + if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_pwqs); else @@ -3532,8 +3665,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ - if (is_last) + if (is_last) { + wq_unregister_lockdep(wq); call_rcu(&wq->rcu, rcu_free_wq); + } } /** @@ -4067,11 +4202,9 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } -struct workqueue_struct *__alloc_workqueue_key(const char *fmt, - unsigned int flags, - int max_active, - struct lock_class_key *key, - const char *lock_name, ...) +struct workqueue_struct *alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, ...) 
{ size_t tbl_size = 0; va_list args; @@ -4106,7 +4239,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_free_wq; } - va_start(args, lock_name); + va_start(args, max_active); vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); @@ -4123,11 +4256,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); - lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); + wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); if (alloc_and_link_pwqs(wq) < 0) - goto err_free_wq; + goto err_unreg_lockdep; if (wq_online && init_rescuer(wq) < 0) goto err_destroy; @@ -4153,6 +4286,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return wq; +err_unreg_lockdep: + wq_unregister_lockdep(wq); + wq_free_lockdep(wq); err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); @@ -4161,7 +4297,7 @@ err_destroy: destroy_workqueue(wq); return NULL; } -EXPORT_SYMBOL_GPL(__alloc_workqueue_key); +EXPORT_SYMBOL_GPL(alloc_workqueue); /** * destroy_workqueue - safely terminate a workqueue @@ -4214,6 +4350,7 @@ void destroy_workqueue(struct workqueue_struct *wq) kthread_stop(wq->rescuer->task); if (!(wq->flags & WQ_UNBOUND)) { + wq_unregister_lockdep(wq); /* * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. @@ -4444,7 +4581,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4469,7 +4606,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? 
"," : "", work->func); } } @@ -4501,7 +4638,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? "," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? "(RESCUER)" : "", worker->current_func); @@ -4786,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool) * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in - * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. */ diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cb68b03ca89a..498de0e909a4 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -44,6 +44,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + int sleeping; /* None */ /* * Opaque string set with work_set_desc(). Printed out with task @@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched/ and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |