From 439e7271dc2b63de379e37971dc2f64d71e24f8a Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Thu, 31 Aug 2017 16:37:41 -0400 Subject: livepatch: introduce shadow variable API Add exported API for livepatch modules: klp_shadow_get() klp_shadow_alloc() klp_shadow_get_or_alloc() klp_shadow_free() klp_shadow_free_all() that implement "shadow" variables, which allow callers to associate new shadow fields to existing data structures. This is intended to be used by livepatch modules seeking to emulate additions to data structure definitions. See Documentation/livepatch/shadow-vars.txt for a summary of the new shadow variable API, including a few common use cases. See samples/livepatch/livepatch-shadow-* for example modules that demonstrate shadow variables. [jkosina@suse.cz: fix __klp_shadow_get_or_alloc() comment as spotted by Josh] Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/Makefile | 2 +- kernel/livepatch/shadow.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 kernel/livepatch/shadow.c (limited to 'kernel') diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 2b8bdb1925da..b36ceda6488e 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o transition.o +livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c new file mode 100644 index 000000000000..67e4360521f3 --- /dev/null +++ b/kernel/livepatch/shadow.c @@ -0,0 +1,277 @@ +/* + * shadow.c - Shadow Variables + * + * Copyright (C) 2014 Josh Poimboeuf + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2017 Joe Lawrence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +/** + * DOC: Shadow variable API concurrency notes: + * + * The shadow variable API provides a simple relationship between an + * pair and a pointer value. It is the responsibility of the + * caller to provide any mutual exclusion required of the shadow data. + * + * Once a shadow variable is attached to its parent object via the + * klp_shadow_*alloc() API calls, it is considered live: any subsequent + * call to klp_shadow_get() may then return the shadow variable's data + * pointer. Callers of klp_shadow_*alloc() should prepare shadow data + * accordingly. + * + * The klp_shadow_*alloc() API calls may allocate memory for new shadow + * variable structures. Their implementation does not call kmalloc + * inside any spinlocks, but API callers should pass GFP flags according + * to their specific needs. + * + * The klp_shadow_hash is an RCU-enabled hashtable and is safe against + * concurrent klp_shadow_free() and klp_shadow_get() operations. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +static DEFINE_HASHTABLE(klp_shadow_hash, 12); + +/* + * klp_shadow_lock provides exclusive access to the klp_shadow_hash and + * the shadow variables it references. + */ +static DEFINE_SPINLOCK(klp_shadow_lock); + +/** + * struct klp_shadow - shadow variable structure + * @node: klp_shadow_hash hash table node + * @rcu_head: RCU is used to safely free this structure + * @obj: pointer to parent object + * @id: data identifier + * @data: data area + */ +struct klp_shadow { + struct hlist_node node; + struct rcu_head rcu_head; + void *obj; + unsigned long id; + char data[]; +}; + +/** + * klp_shadow_match() - verify a shadow variable matches given + * @shadow: shadow variable to match + * @obj: pointer to parent object + * @id: data identifier + * + * Return: true if the shadow variable matches. + */ +static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj, + unsigned long id) +{ + return shadow->obj == obj && shadow->id == id; +} + +/** + * klp_shadow_get() - retrieve a shadow variable data pointer + * @obj: pointer to parent object + * @id: data identifier + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + + rcu_read_lock(); + + hash_for_each_possible_rcu(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + rcu_read_unlock(); + return shadow->data; + } + } + + rcu_read_unlock(); + + return NULL; +} +EXPORT_SYMBOL_GPL(klp_shadow_get); + +void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags, bool warn_on_exist) +{ + struct klp_shadow *new_shadow; + void *shadow_data; + unsigned long flags; + + /* Check if the shadow variable already exists */ + shadow_data = klp_shadow_get(obj, id); + if (shadow_data) + goto exists; + + /* Allocate a new shadow variable for use inside the lock below */ + new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags); + if (!new_shadow) + return NULL; + + new_shadow->obj = obj; + new_shadow->id = id; + + /* Initialize the shadow variable if data provided */ + if (data) + memcpy(new_shadow->data, data, size); + + /* Look for again under the lock */ + spin_lock_irqsave(&klp_shadow_lock, flags); + shadow_data = klp_shadow_get(obj, id); + if (unlikely(shadow_data)) { + /* + * Shadow variable was found, throw away speculative + * allocation. + */ + spin_unlock_irqrestore(&klp_shadow_lock, flags); + kfree(new_shadow); + goto exists; + } + + /* No found, so attach the newly allocated one */ + hash_add_rcu(klp_shadow_hash, &new_shadow->node, + (unsigned long)new_shadow->obj); + spin_unlock_irqrestore(&klp_shadow_lock, flags); + + return new_shadow->data; + +exists: + if (warn_on_exist) { + WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id); + return NULL; + } + + return shadow_data; +} + +/** + * klp_shadow_alloc() - allocate and add a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Allocates @size bytes for new shadow variable data using @gfp_flags + * and copies @size bytes from @data into the new shadow variable's own + * data space. If @data is NULL, @size bytes are still allocated, but + * no copy is performed. The new shadow variable is then added to the + * global hashtable. + * + * If an existing shadow variable can be found, this routine + * will issue a WARN, exit early and return NULL. + * + * Return: the shadow variable data element, NULL on duplicate or + * failure. + */ +void *klp_shadow_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true); +} +EXPORT_SYMBOL_GPL(klp_shadow_alloc); + +/** + * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable + * @obj: pointer to parent object + * @id: data identifier + * @data: pointer to data to attach to parent + * @size: size of attached data + * @gfp_flags: GFP mask for allocation + * + * Returns a pointer to existing shadow data if an shadow + * variable is already present. Otherwise, it creates a new shadow + * variable like klp_shadow_alloc(). + * + * This function guarantees that only one shadow variable exists with + * the given @id for the given @obj. It also guarantees that the shadow + * variable will be initialized by the given @data only when it did not + * exist before. + * + * Return: the shadow variable data element, NULL on failure. + */ +void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, + size_t size, gfp_t gfp_flags) +{ + return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false); +} +EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc); + +/** + * klp_shadow_free() - detach and free a shadow variable + * @obj: pointer to parent object + * @id: data identifier + * + * This function releases the memory for this shadow variable + * instance, callers should stop referencing it accordingly. + */ +void klp_shadow_free(void *obj, unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete from hash */ + hash_for_each_possible(klp_shadow_hash, shadow, node, + (unsigned long)obj) { + + if (klp_shadow_match(shadow, obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + break; + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free); + +/** + * klp_shadow_free_all() - detach and free all <*, id> shadow variables + * @id: data identifier + * + * This function releases the memory for all <*, id> shadow variable + * instances, callers should stop referencing them accordingly. + */ +void klp_shadow_free_all(unsigned long id) +{ + struct klp_shadow *shadow; + unsigned long flags; + int i; + + spin_lock_irqsave(&klp_shadow_lock, flags); + + /* Delete all <*, id> from hash */ + hash_for_each(klp_shadow_hash, i, shadow, node) { + if (klp_shadow_match(shadow, shadow->obj, id)) { + hash_del_rcu(&shadow->node); + kfree_rcu(shadow, rcu_head); + } + } + + spin_unlock_irqrestore(&klp_shadow_lock, flags); +} +EXPORT_SYMBOL_GPL(klp_shadow_free_all); -- cgit From 5d9da759f7587c87252ef98e70bc0b4a89e4d036 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 14 Sep 2017 14:15:36 -0700 Subject: livepatch: __klp_shadow_get_or_alloc() is local to shadow.c ... therefore make it static. Fixes: 439e7271dc2 ("livepatch: introduce shadow variable API") Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/shadow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 67e4360521f3..fdac27588d60 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -113,7 +113,7 @@ void *klp_shadow_get(void *obj, unsigned long id) } EXPORT_SYMBOL_GPL(klp_shadow_get); -void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, +static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data, size_t size, gfp_t gfp_flags, bool warn_on_exist) { struct klp_shadow *new_shadow; -- cgit From e454cf5958538666635488030046b6a84a22d447 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 18 Sep 2017 15:30:55 -0400 Subject: bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE This is a simple non-recursive delete operation. It prunes paths of empty nodes in the tree, but it does not try to further compress the tree as nodes are removed. Signed-off-by: Craig Gallek Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1b767844a76f..9d58a576b2ae 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -389,10 +389,84 @@ out: return ret; } -static int trie_delete_elem(struct bpf_map *map, void *key) +/* Called from syscall or from eBPF program */ +static int trie_delete_elem(struct bpf_map *map, void *_key) { - /* TODO */ - return -ENOSYS; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key; + struct lpm_trie_node __rcu **trim; + struct lpm_trie_node *node; + unsigned long irq_flags; + unsigned int next_bit; + size_t matchlen = 0; + int ret = 0; + + if (key->prefixlen > trie->max_prefixlen) + return -EINVAL; + + raw_spin_lock_irqsave(&trie->lock, irq_flags); + + /* Walk the tree looking for an exact key/length match and keeping + * track of where we could begin trimming the tree. The trim-point + * is the sub-tree along the walk consisting of only single-child + * intermediate nodes and ending at a leaf node that we want to + * remove. + */ + trim = &trie->root; + node = rcu_dereference_protected( + trie->root, lockdep_is_held(&trie->lock)); + while (node) { + matchlen = longest_prefix_match(trie, node, key); + + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + /* If we hit a node that has more than one child or is a valid + * prefix itself, do not remove it. Reset the root of the trim + * path to its descendant on our path. + */ + if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || + (node->child[0] && node->child[1])) + trim = &node->child[next_bit]; + node = rcu_dereference_protected( + node->child[next_bit], lockdep_is_held(&trie->lock)); + } + + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + ret = -ENOENT; + goto out; + } + + trie->n_entries--; + + /* If the node we are removing is not a leaf node, simply mark it + * as intermediate and we are done. + */ + if (rcu_access_pointer(node->child[0]) || + rcu_access_pointer(node->child[1])) { + node->flags |= LPM_TREE_NODE_FLAG_IM; + goto out; + } + + /* trim should now point to the slot holding the start of a path from + * zero or more intermediate nodes to our leaf node for deletion. + */ + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { + RCU_INIT_POINTER(*trim, NULL); + trim = rcu_access_pointer(node->child[0]) ? + &node->child[0] : + &node->child[1]; + kfree_rcu(node, rcu); + } + +out: + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + return ret; } #define LPM_DATA_SIZE_MAX 256 -- cgit From f454322efbf6faee695f517c6b52c4dc03cacd3e Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:11 +0300 Subject: signal: replace sigset_to_compat() with put_compat_sigset() There are 4 callers of sigset_to_compat() in the entire kernel. One is in sparc compat rt_sigaction(2), the rest are in kernel/signal.c itself. All are followed by copy_to_user(), and all but the sparc one are under "if it's big-endian..." ifdefs. Let's transform sigset_to_compat() into put_compat_sigset() that also calls copy_to_user(). Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/compat.c | 20 ++++++++++++++------ kernel/signal.c | 27 ++++++--------------------- 2 files changed, 20 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 772e038d04d9..18dd902c9052 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -497,15 +497,23 @@ sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) } EXPORT_SYMBOL_GPL(sigset_from_compat); -void -sigset_to_compat(compat_sigset_t *compat, const sigset_t *set) +int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) { + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; switch (_NSIG_WORDS) { - case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3]; - case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2]; - case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1]; - case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0]; + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? -EFAULT : 0; +#endif } #ifdef CONFIG_NUMA diff --git a/kernel/signal.c b/kernel/signal.c index 800a18f77732..14ad6bb90dad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2621,13 +2621,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, if (error) return error; } - if (oset) { - compat_sigset_t old32; - sigset_to_compat(&old32, &old_set); - if (copy_to_user(oset, &old32, sizeof(compat_sigset_t))) - return -EFAULT; - } - return 0; + return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; #else return sys_rt_sigprocmask(how, (sigset_t __user *)nset, (sigset_t __user *)oset, sigsetsize); @@ -2669,20 +2663,11 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t set; int err = do_sigpending(&set, sigsetsize); - if (!err) { - compat_sigset_t set32; - sigset_to_compat(&set32, &set); - /* we can get here only if sigsetsize <= sizeof(set) */ - if (copy_to_user(uset, &set32, sigsetsize)) - err = -EFAULT; - } + if (!err) + err = put_compat_sigset(uset, &set, sigsetsize); return err; -#else - return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize); -#endif } #endif @@ -3451,7 +3436,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; - compat_sigset_t mask; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif @@ -3463,6 +3447,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; + compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER @@ -3478,10 +3463,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - sigset_to_compat(&mask, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); + ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, + sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), -- cgit From 1681634b8c70353d8ca211b9b3443889a16dafeb Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:29 +0300 Subject: signal: simplify compat_sigpending() Remove "if it's big-endian..." ifdef in compat_sigpending(), use the endian-agnostic variant. Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/signal.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 14ad6bb90dad..f59c05fc374a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3330,15 +3330,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { -#ifdef __BIG_ENDIAN sigset_t set; int err = do_sigpending(&set, sizeof(set.sig[0])); if (!err) err = put_user(set.sig[0], set32); return err; -#else - return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); -#endif } #endif -- cgit From 176826af03666758c656dd27f098cc889b71638b Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 22 Aug 2017 02:16:43 +0300 Subject: signal: lift sigset size check out of do_sigpending() As sigsetsize argument of do_sigpending() is not used anywhere else in that function after the check, remove this argument and move the check out of do_sigpending() into rt_sigpending() and its compat analog. Suggested-by: Al Viro Signed-off-by: Dmitry V. Levin Signed-off-by: Al Viro --- kernel/signal.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f59c05fc374a..9fbc574ced10 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2629,11 +2629,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, } #endif -static int do_sigpending(void *set, unsigned long sigsetsize) +static int do_sigpending(sigset_t *set) { - if (sigsetsize > sizeof(sigset_t)) - return -EINVAL; - spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, ¤t->signal->shared_pending.signal); @@ -2653,7 +2650,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err = do_sigpending(&set, sigsetsize); + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); if (!err && copy_to_user(uset, &set, sigsetsize)) err = -EFAULT; return err; @@ -2664,7 +2666,12 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; - int err = do_sigpending(&set, sigsetsize); + int err; + + if (sigsetsize > sizeof(*uset)) + return -EINVAL; + + err = do_sigpending(&set); if (!err) err = put_compat_sigset(uset, &set, sigsetsize); return err; @@ -3331,7 +3338,7 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; - int err = do_sigpending(&set, sizeof(set.sig[0])); + int err = do_sigpending(&set); if (!err) err = put_user(set.sig[0], set32); return err; -- cgit From b8e8e1aa9f14110da180569908bbe538c9e9dc63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 20:42:54 -0400 Subject: get rid of {get,put}_compat_itimerspec() no users left Signed-off-by: Al Viro --- kernel/compat.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 18dd902c9052..d43b18031116 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, return ret; } -int get_compat_itimerspec(struct itimerspec *dst, - const struct compat_itimerspec __user *src) -{ - if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || - __compat_get_timespec(&dst->it_value, &src->it_value)) - return -EFAULT; - return 0; -} - -int put_compat_itimerspec(struct compat_itimerspec __user *dst, - const struct itimerspec *src) -{ - if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || - __compat_put_timespec(&src->it_value, &dst->it_value)) - return -EFAULT; - return 0; -} - int get_compat_itimerspec64(struct itimerspec64 *its, const struct compat_itimerspec __user *uits) { -- cgit From 3968cf623892d710e651070243fd16af312a9797 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Sep 2017 21:45:17 -0400 Subject: get_compat_sigset() similar to put_compat_sigset() Signed-off-by: Al Viro --- kernel/compat.c | 23 ++++++++++++++++------- kernel/signal.c | 27 ++++----------------------- 2 files changed, 20 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index d43b18031116..a46a4a40bb8b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -467,17 +467,26 @@ Efault: return -EFAULT; } -void -sigset_from_compat(sigset_t *set, const compat_sigset_t *compat) +int +get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) { +#ifdef __BIG_ENDIAN + compat_sigset_t v; + if (copy_from_user(&v, compat, sizeof(compat_sigset_t))) + return -EFAULT; switch (_NSIG_WORDS) { - case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); - case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); - case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); - case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); + case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } +#else + if (copy_from_user(set, compat, sizeof(compat_sigset_t))) + return -EFAULT; +#endif + return 0; } -EXPORT_SYMBOL_GPL(sigset_from_compat); +EXPORT_SYMBOL_GPL(get_compat_sigset); int put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, diff --git a/kernel/signal.c b/kernel/signal.c index 9fbc574ced10..36a523640894 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2600,7 +2600,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. */ @@ -2608,13 +2607,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return -EINVAL; if (nset) { - compat_sigset_t new32; sigset_t new_set; int error; - if (copy_from_user(&new32, nset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&new_set, nset)) return -EFAULT; - - sigset_from_compat(&new_set, &new32); sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); @@ -2622,10 +2618,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, return error; } return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; -#else - return sys_rt_sigprocmask(how, (sigset_t __user *)nset, - (sigset_t __user *)oset, sigsetsize); -#endif } #endif @@ -2908,7 +2900,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct compat_timespec __user *, uts, compat_size_t, sigsetsize) { - compat_sigset_t s32; sigset_t s; struct timespec t; siginfo_t info; @@ -2917,9 +2908,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + if (get_compat_sigset(&s, uthese)) return -EFAULT; - sigset_from_compat(&s, &s32); if (uts) { if (compat_get_timespec(&t, uts)) @@ -3450,18 +3440,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, if (act) { compat_uptr_t handler; - compat_sigset_t mask; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif - ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); + ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; - sigset_from_compat(&new_ka.sa.sa_mask, &mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); @@ -3649,22 +3637,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { -#ifdef __BIG_ENDIAN sigset_t newset; - compat_sigset_t newset32; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t))) + if (get_compat_sigset(&newset, unewset)) return -EFAULT; - sigset_from_compat(&newset, &newset32); return sigsuspend(&newset); -#else - /* on little-endian bitmaps don't care about granularity */ - return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize); -#endif } #endif -- cgit From abca5fc535a3ee0f36fb6d4468a453eaae769921 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 19 Sep 2017 18:17:46 -0400 Subject: sched_rr_get_interval(): move compat to native, get rid of set_fs() switch to using timespec64 internally, while we are at it Signed-off-by: Al Viro --- kernel/compat.c | 16 ---------------- kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index a46a4a40bb8b..d1cee656a7ed 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -562,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, } #endif -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct compat_timespec __user *, interval) -{ - struct timespec t; - int ret; - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs(old_fs); - if (compat_put_timespec(&t, interval)) - return -EFAULT; - return ret; -} - /* * Allocate user-space memory for the duration of a single system call, * in order to marshall parameters inside a compat thunk. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..e74f0a5a8647 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -5098,13 +5099,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * Return: On success, 0 and the timeslice is in @interval. Otherwise, * an error code. */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; unsigned int time_slice; struct rq_flags rf; - struct timespec t; struct rq *rq; int retval; @@ -5128,15 +5127,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, task_rq_unlock(rq, p, &rf); rcu_read_unlock(); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; - return retval; + jiffies_to_timespec64(time_slice, t); + return 0; out_unlock: rcu_read_unlock(); return retval; } +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = compat_put_timespec64(&t, interval); + return retval; +} +#endif + void sched_show_task(struct task_struct *p) { unsigned long free = 0; -- cgit From cfb766da54d98ceb145e1bb0bd11c559569dcbfc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: sched/cputime: Expose cputime_adjust() Will be used by basic cgroup resource stat reporting later. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- kernel/sched/cputime.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..8839b6e8a104 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -585,9 +585,8 @@ drop_precision: * * Assuming that rtime_i+1 >= rtime_i. */ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - u64 *ut, u64 *st) +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) { u64 rtime, stime, utime; unsigned long flags; -- cgit From d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:04 -0700 Subject: cpuacct: Introduce cgroup_account_cputime[_field]() Introduce cgroup_account_cputime[_field]() which wrap cpuacct_charge() and cgroup_account_field(). This doesn't introduce any functional changes and will be used to add cgroup basic resource accounting. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar --- kernel/sched/cpuacct.h | 17 ----------------- kernel/sched/cputime.c | 2 +- kernel/sched/deadline.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/stop_task.c | 2 +- 7 files changed, 6 insertions(+), 23 deletions(-) delete mode 100644 kernel/sched/cpuacct.h (limited to 'kernel') diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h deleted file mode 100644 index ba72807c73d4..000000000000 --- a/kernel/sched/cpuacct.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifdef CONFIG_CGROUP_CPUACCT - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); - -#else - -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ -} - -static inline void -cpuacct_account_field(struct task_struct *tsk, int index, u64 val) -{ -} - -#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8839b6e8a104..e01b699bbd5b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __this_cpu_add(kernel_cpustat.cpustat[index], tmp); - cpuacct_account_field(p, index, tmp); + cgroup_account_cputime_field(p, index, tmp); } /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0191ec7667c3..abd913c1b99e 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1143,7 +1143,7 @@ static void update_curr_dl(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 70ba32e08a23..0ae69af95b8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -851,7 +851,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); + cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0af5ca9e3e3f..fdc2c5d1f82e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -979,7 +979,7 @@ static void update_curr_rt(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14db76cd496f..f0b98f978843 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef CONFIG_PARAVIRT #include @@ -36,7 +37,6 @@ #include "cpupri.h" #include "cpudeadline.h" -#include "cpuacct.h" #ifdef CONFIG_SCHED_DEBUG # define SCHED_WARN_ON(x) WARN_ONCE(x, #x) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 9f69fb630853..ec0bb5ab9024 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -71,7 +71,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) -- cgit From 041cd640b2f3c5607171c59d8712b503659d21f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 08:12:05 -0700 Subject: cgroup: Implement cgroup2 basic CPU usage accounting In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to combination of two factors - 1. enabling cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. cpuacct controller is effectively used to designate a hierarchy to track CPU usages on. cgroup2's unified hierarchy removes the second reason and we can account basic CPU usages by default. While we can use cpuacct for this purpose, both its interface and implementation leave a lot to be desired - it collects and exposes two sources of truth which don't agree with each other and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event which is unnecessary. This patch adds basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it. * All accountings are done per-cpu and don't propagate immediately. It just bumps the per-cgroup per-cpu counters and links to the parent's updated list if not already on it. * On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated. * CPU usage stats are collected and shown in "cgroup.stat" with "cpu." prefix. Total usage is collected from scheduling events. User/sys breakdown is sourced from tick sampling and adjusted to the usage using cputime_adjust(). This keeps the accounting side hot path O(1) and per-cpu and the read side O(nr_updated_since_last_read). v2: Minor changes and documentation updates as suggested by Waiman and Roman. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner Cc: Waiman Long Cc: Roman Gushchin --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/cgroup-internal.h | 8 + kernel/cgroup/cgroup.c | 24 ++- kernel/cgroup/stat.c | 334 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 365 insertions(+), 3 deletions(-) create mode 100644 kernel/cgroup/stat.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index ce693ccb8c58..0acee616e06c 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,4 +1,4 @@ -obj-y := cgroup.o namespace.o cgroup-v1.o +obj-y := cgroup.o stat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 5151ff256c29..fa642c99586a 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); +/* + * stat.c + */ +void cgroup_stat_flush(struct cgroup *cgrp); +int cgroup_stat_init(struct cgroup *cgrp); +void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_boot(void); + /* * namespace.c */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..d036625556c9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS +static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); + /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); + cgroup_stat_show_cputime(seq, "cpu."); + return 0; } @@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work) */ cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_exit(cgrp); kfree(cgrp); } else { /* @@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ trace_cgroup_release(cgrp); + if (cgroup_on_dfl(cgrp)) + cgroup_stat_flush(cgrp); + for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; @@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_free_cgrp; + if (cgroup_on_dfl(parent)) { + ret = cgroup_stat_init(cgrp); + if (ret) + goto out_cancel_ref; + } + /* * Temporarily set the pointer to NULL, so idr_find() won't return * a half-baked cgroup. @@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL); if (cgrp->id < 0) { ret = -ENOMEM; - goto out_cancel_ref; + goto out_stat_exit; } init_cgroup_housekeeping(cgrp); @@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) return cgrp; +out_stat_exit: + if (cgroup_on_dfl(parent)) + cgroup_stat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5148,6 +5166,8 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); + cgroup_stat_boot(); + /* * The latency of the synchronize_sched() is too high for cgroups, * avoid it at the cost of forcing all readers into the slow path. diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c new file mode 100644 index 000000000000..9cce79e89320 --- /dev/null +++ b/kernel/cgroup/stat.c @@ -0,0 +1,334 @@ +#include "cgroup-internal.h" + +#include + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. + */ +static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (cstat->updated_next) + break; + + cstat->updated_next = pcstat->updated_children; + pcstat->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + +/** + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_cpu_stat_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_cpu_stat *cstat; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + cstat = cgroup_cpu_stat(pos, cpu); + if (cstat->updated_children == pos) + break; + pos = cstat->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && cstat->updated_next) { + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_cpu_stat *ncstat; + struct cgroup **nextp; + + nextp = &pcstat->updated_children; + while (true) { + ncstat = cgroup_cpu_stat(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &ncstat->updated_next; + } + + *nextp = cstat->updated_next; + cstat->updated_next = NULL; + } + + return pos; +} + +static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, + struct cgroup_stat *src_stat) +{ + dst_stat->cputime.utime += src_stat->cputime.utime; + dst_stat->cputime.stime += src_stat->cputime.stime; + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; +} + +static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct task_cputime *last_cputime = &cstat->last_cputime; + struct task_cputime cputime; + struct cgroup_stat delta; + unsigned seq; + + lockdep_assert_held(&cgroup_stat_mutex); + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&cstat->sync); + cputime = cstat->cputime; + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + + /* accumulate the deltas to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); + memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_stat_accumulate(&cgrp->stat, &delta); + if (parent) + cgroup_stat_accumulate(&parent->pending_stat, &delta); +} + +/* see cgroup_stat_flush() */ +static void cgroup_stat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_stat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + cgroup_cpu_stat_flush_one(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_stat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "%susage_usec %llu\n" + "%suser_usec %llu\n" + "%ssystem_usec %llu\n", + prefix, usage, prefix, utime, prefix, stime); +} + +int cgroup_stat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) + cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +} -- cgit From 38683148828165ea0b66ace93a9fedc2d3281e27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 13:50:20 -0700 Subject: cgroup: statically initialize init_css_set->dfl_cgrp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like other csets, init_css_set's dfl_cgrp is initialized when the cset gets linked. init_css_set gets linked in cgroup_init(). This has been fine till now but the recently added basic CPU usage accounting may end up accessing dfl_cgrp of init before cgroup_init() leading to the following oops. SELinux: Initializing. BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0 IP: account_system_index_time+0x60/0x90 PGD 0 P4D 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc2-00003-g041cd64 #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +1.9.3-20161025_171302-gandalf 04/01/2014 task: ffffffff81e10480 task.stack: ffffffff81e00000 RIP: 0010:account_system_index_time+0x60/0x90 RSP: 0000:ffff880011e03cb8 EFLAGS: 00010002 RAX: ffffffff81ef8800 RBX: ffffffff81e10480 RCX: 0000000000000003 RDX: 0000000000000000 RSI: 00000000000f4240 RDI: 0000000000000000 RBP: ffff880011e03cc0 R08: 0000000000010000 R09: 0000000000000000 R10: 0000000000000020 R11: 0000003b9aca0000 R12: 000000000001c100 R13: 0000000000000000 R14: ffffffff81e10480 R15: ffffffff81e03cd8 FS: 0000000000000000(0000) GS:ffff880011e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000b0 CR3: 0000000001e09000 CR4: 00000000000006b0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: account_system_time+0x45/0x60 account_process_tick+0x5a/0x140 update_process_times+0x22/0x60 tick_periodic+0x2b/0x90 tick_handle_periodic+0x25/0x70 timer_interrupt+0x15/0x20 __handle_irq_event_percpu+0x7e/0x1b0 handle_irq_event_percpu+0x23/0x60 handle_irq_event+0x42/0x70 handle_level_irq+0x83/0x100 handle_irq+0x6f/0x110 do_IRQ+0x46/0xd0 common_interrupt+0x9d/0x9d Fix it by statically initializing init_css_set.dfl_cgrp so that init's default cgroup is accessible from the get-go. Fixes: 041cd640b2f3 ("cgroup: Implement cgroup2 basic CPU usage accounting") Reported-by: “kbuild-all@01.org” Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d036625556c9..7975b20f1fd1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -649,6 +649,14 @@ struct css_set init_css_set = { .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), + + /* + * The following field is re-initialized when this cset gets linked + * in cgroup_init(). However, let's initialize the field + * statically too so that the default cgroup can be accessed safely + * early during boot. + */ + .dfl_cgrp = &cgrp_dfl_root.cgrp, }; static int css_set_count = 1; /* 1 for init_css_set */ -- cgit From 8157a7faf94135386bf04b1cf94e6efd3fb94702 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 14:27:54 -0700 Subject: sched/cputime: Add dummy cputime_adjust() implementation for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cfb766da54d9 ("sched/cputime: Expose cputime_adjust()") made cputime_adjust() public for cgroup basic cpu stat support; however, the commit forgot to add a dummy implementaiton for CONFIG_VIRT_CPU_ACCOUNTING_NATIVE leading to compiler errors on some s390 configurations. Fix it by adding the missing dummy implementation. Reported-by: “kbuild-all@01.org” Fixes: cfb766da54d9 ("sched/cputime: Expose cputime_adjust()") Signed-off-by: Tejun Heo --- kernel/sched/cputime.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e01b699bbd5b..5498f20d2475 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -447,6 +447,13 @@ void vtime_account_irq_enter(struct task_struct *tsk) EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) +{ + *ut = curr->utime; + *st = curr->stime; +} + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { *ut = p->utime; -- cgit From b5d7388f9db78f19e6af7b56a469ca8d1860329d Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 21 Sep 2017 18:43:29 -0400 Subject: bpf: Optimize lpm trie delete Before the delete operator was added, this datastructure maintained an invariant that intermediate nodes were only present when necessary to build the tree. This patch updates the delete operation to reinstate that invariant by removing unnecessary intermediate nodes after a node is removed and thus keeping the tree structure at a minimal size. Suggested-by: Daniel Mack Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- kernel/bpf/lpm_trie.c | 71 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9d58a576b2ae..34d8a690ea05 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -394,8 +394,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key; - struct lpm_trie_node __rcu **trim; - struct lpm_trie_node *node; + struct lpm_trie_node __rcu **trim, **trim2; + struct lpm_trie_node *node, *parent; unsigned long irq_flags; unsigned int next_bit; size_t matchlen = 0; @@ -407,31 +407,26 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) raw_spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping - * track of where we could begin trimming the tree. The trim-point - * is the sub-tree along the walk consisting of only single-child - * intermediate nodes and ending at a leaf node that we want to - * remove. + * track of the path we traverse. We will need to know the node + * we wish to delete, and the slot that points to the node we want + * to delete. We may also need to know the nodes parent and the + * slot that contains it. */ trim = &trie->root; - node = rcu_dereference_protected( - trie->root, lockdep_is_held(&trie->lock)); - while (node) { + trim2 = trim; + parent = NULL; + while ((node = rcu_dereference_protected( + *trim, lockdep_is_held(&trie->lock)))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || node->prefixlen == key->prefixlen) break; + parent = node; + trim2 = trim; next_bit = extract_bit(key->data, node->prefixlen); - /* If we hit a node that has more than one child or is a valid - * prefix itself, do not remove it. Reset the root of the trim - * path to its descendant on our path. - */ - if (!(node->flags & LPM_TREE_NODE_FLAG_IM) || - (node->child[0] && node->child[1])) - trim = &node->child[next_bit]; - node = rcu_dereference_protected( - node->child[next_bit], lockdep_is_held(&trie->lock)); + trim = &node->child[next_bit]; } if (!node || node->prefixlen != key->prefixlen || @@ -442,27 +437,47 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) trie->n_entries--; - /* If the node we are removing is not a leaf node, simply mark it + /* If the node we are removing has two children, simply mark it * as intermediate and we are done. */ - if (rcu_access_pointer(node->child[0]) || + if (rcu_access_pointer(node->child[0]) && rcu_access_pointer(node->child[1])) { node->flags |= LPM_TREE_NODE_FLAG_IM; goto out; } - /* trim should now point to the slot holding the start of a path from - * zero or more intermediate nodes to our leaf node for deletion. + /* If the parent of the node we are about to delete is an intermediate + * node, and the deleted node doesn't have any children, we can delete + * the intermediate parent as well and promote its other child + * up the tree. Doing this maintains the invariant that all + * intermediate nodes have exactly 2 children and that there are no + * unnecessary intermediate nodes in the tree. */ - while ((node = rcu_dereference_protected( - *trim, lockdep_is_held(&trie->lock)))) { - RCU_INIT_POINTER(*trim, NULL); - trim = rcu_access_pointer(node->child[0]) ? - &node->child[0] : - &node->child[1]; + if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) && + !node->child[0] && !node->child[1]) { + if (node == rcu_access_pointer(parent->child[0])) + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[1])); + else + rcu_assign_pointer( + *trim2, rcu_access_pointer(parent->child[0])); + kfree_rcu(parent, rcu); kfree_rcu(node, rcu); + goto out; } + /* The node we are removing has either zero or one child. If there + * is a child, move it into the removed node's slot then delete + * the node. Otherwise just clear the slot and delete the node. + */ + if (node->child[0]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0])); + else if (node->child[1]) + rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); + else + RCU_INIT_POINTER(*trim, NULL); + kfree_rcu(node, rcu); + out: raw_spin_unlock_irqrestore(&trie->lock, irq_flags); -- cgit From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 14 Sep 2017 14:02:04 -0700 Subject: kthread: add a mechanism to store cgroup info kthread usually runs jobs on behalf of other threads. The jobs should be charged to cgroup of original threads. But the jobs run in a kthread, where we lose the cgroup context of original threads. The patch adds a machanism to record cgroup info of original threads in kthread context. Later we can retrieve the cgroup info and attach the cgroup info to jobs. Since this mechanism is only required by kthread, we store the cgroup info in kthread data instead of generic task_struct. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 1c19edf82427..b011ea08967f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,7 +20,6 @@ #include #include #include -#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -47,6 +46,9 @@ struct kthread { void *data; struct completion parked; struct completion exited; +#ifdef CONFIG_CGROUPS + struct cgroup_subsys_state *blkcg_css; +#endif }; enum KTHREAD_BITS { @@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k) void free_kthread_struct(struct task_struct *k) { + struct kthread *kthread; + /* * Can be NULL if this kthread was created by kernel_thread() * or if kmalloc() in kthread() failed. */ - kfree(to_kthread(k)); + kthread = to_kthread(k); +#ifdef CONFIG_CGROUPS + WARN_ON_ONCE(kthread && kthread->blkcg_css); +#endif + kfree(kthread); } /** @@ -216,6 +224,9 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); +#ifdef CONFIG_CGROUPS + self->blkcg_css = NULL; +#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ @@ -1154,3 +1165,54 @@ void kthread_destroy_worker(struct kthread_worker *worker) kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); + +#ifdef CONFIG_CGROUPS +/** + * kthread_associate_blkcg - associate blkcg to current kthread + * @css: the cgroup info + * + * Current thread must be a kthread. The thread is running jobs on behalf of + * other threads. In some cases, we expect the jobs attach cgroup info of + * original threads instead of that of current thread. This function stores + * original thread's cgroup info in current kthread context for later + * retrieval. + */ +void kthread_associate_blkcg(struct cgroup_subsys_state *css) +{ + struct kthread *kthread; + + if (!(current->flags & PF_KTHREAD)) + return; + kthread = to_kthread(current); + if (!kthread) + return; + + if (kthread->blkcg_css) { + css_put(kthread->blkcg_css); + kthread->blkcg_css = NULL; + } + if (css) { + css_get(css); + kthread->blkcg_css = css; + } +} +EXPORT_SYMBOL(kthread_associate_blkcg); + +/** + * kthread_blkcg - get associated blkcg css of current kthread + * + * Current thread must be a kthread. + */ +struct cgroup_subsys_state *kthread_blkcg(void) +{ + struct kthread *kthread; + + if (current->flags & PF_KTHREAD) { + kthread = to_kthread(current); + if (kthread) + return kthread->blkcg_css; + } + return NULL; +} +EXPORT_SYMBOL(kthread_blkcg); +#endif -- cgit From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2017 11:02:12 -0700 Subject: block: fix a build error The code is only for blkcg not for all cgroups Fixes: d4478e92d618 ("block/loop: make loop cgroup aware") Reported-by: kbuild test robot Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index b011ea08967f..f87cd8b4eb2a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -46,7 +46,7 @@ struct kthread { void *data; struct completion parked; struct completion exited; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif }; @@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k) * or if kmalloc() in kthread() failed. */ kthread = to_kthread(k); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread && kthread->blkcg_css); #endif kfree(kthread); @@ -224,7 +224,7 @@ static int kthread(void *_create) self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP self->blkcg_css = NULL; #endif current->vfork_done = &self->exited; @@ -1166,7 +1166,7 @@ void kthread_destroy_worker(struct kthread_worker *worker) } EXPORT_SYMBOL(kthread_destroy_worker); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info -- cgit From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:50 +0200 Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers Just do the rename into bpf_compute_data_pointers() as we'll add one more pointer here to recompute. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/sockmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 6424ce0e4969..a298d6666698 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) skb_orphan(skb); skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; @@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); -- cgit From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 25 Sep 2017 02:25:51 +0200 Subject: bpf: add meta pointer for direct access This work enables generic transfer of metadata from XDP into skb. The basic idea is that we can make use of the fact that the resulting skb must be linear and already comes with a larger headroom for supporting bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work on a similar principle and introduce a small helper bpf_xdp_adjust_meta() for adjusting a new pointer called xdp->data_meta. Thus, the packet has a flexible and programmable room for meta data, followed by the actual packet data. struct xdp_buff is therefore laid out that we first point to data_hard_start, then data_meta directly prepended to data followed by data_end marking the end of packet. bpf_xdp_adjust_head() takes into account whether we have meta data already prepended and if so, memmove()s this along with the given offset provided there's enough room. xdp->data_meta is optional and programs are not required to use it. The rationale is that when we process the packet in XDP (e.g. as DoS filter), we can push further meta data along with it for the XDP_PASS case, and give the guarantee that a clsact ingress BPF program on the same device can pick this up for further post-processing. Since we work with skb there, we can also set skb->mark, skb->priority or other skb meta data out of BPF, thus having this scratch space generic and programmable allows for more flexibility than defining a direct 1:1 transfer of potentially new XDP members into skb (it's also more efficient as we don't need to initialize/handle each of such new members). The facility also works together with GRO aggregation. The scratch space at the head of the packet can be multiple of 4 byte up to 32 byte large. Drivers not yet supporting xdp->data_meta can simply be set up with xdp->data_meta as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out, such that the subsequent match against xdp->data for later access is guaranteed to fail. The verifier treats xdp->data_meta/xdp->data the same way as we treat xdp->data/xdp->data_end pointer comparisons. The requirement for doing the compare against xdp->data is that it hasn't been modified from it's original address we got from ctx access. It may have a range marking already from prior successful xdp->data/xdp->data_end pointer comparisons though. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 114 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b914fbe1383e..f849eca36052 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...) va_end(args); } +static bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -187,6 +193,7 @@ static const char * const reg_type_str[] = { [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", }; @@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose("(id=%d", reg->id); if (t != SCALAR_VALUE) verbose(",off=%d", reg->off); - if (t == PTR_TO_PACKET) + if (type_is_pkt_pointer(t)) verbose(",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || @@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) __mark_reg_known_zero(regs + regno); } +static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +{ + return type_is_pkt_pointer(reg->type); +} + +static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +{ + return reg_is_pkt_pointer(reg) || + reg->type == PTR_TO_PACKET_END; +} + +/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, + enum bpf_reg_type which) +{ + /* The register can already have a range from prior markings. + * This is fine as long as it hasn't been advanced from its + * origin. + */ + return reg->type == which && + reg->id == 0 && + reg->off == 0 && + tnum_equals_const(reg->var_off, 0); +} + /* Attempts to improve min/max values based on var_off information */ static void __update_reg_bounds(struct bpf_reg_state *reg) { @@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: + case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: case CONST_PTR_TO_MAP: return true; @@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, switch (reg->type) { case PTR_TO_PACKET: - /* special case, because of NET_IP_ALIGN */ + case PTR_TO_PACKET_META: + /* Special case, because of NET_IP_ALIGN. Given metadata sits + * right in front, treat it the very same way. + */ return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; @@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a - * PTR_TO_PACKET[_END]. In the latter case, we know - * the offset is zero. + * PTR_TO_PACKET[_META,_END]. In the latter + * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) mark_reg_unknown(state->regs, value_regno); @@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { err = check_stack_read(state, off, size, value_regno); } - } else if (reg->type == PTR_TO_PACKET) { + } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); return -EACCES; @@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: + case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size); @@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return 0; } - if (type == PTR_TO_PACKET && + if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { verbose("helper access to the packet is not allowed\n"); return -EACCES; @@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (type != PTR_TO_PACKET && type != expected_type) + if (!type_is_pkt_pointer(type) && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (register_is_null(*reg)) /* final test in check_stack_boundary() */; - else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE && + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; @@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->key_size); else @@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - if (type == PTR_TO_PACKET) + if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, meta->map_ptr->value_size); else @@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn) return count > 1 ? -EINVAL : 0; } -/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid, - * so turn them into unknown SCALAR_VALUE. +/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] + * are now invalid, so turn them into unknown SCALAR_VALUE. */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { @@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET || - regs[i].type == PTR_TO_PACKET_END) + if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type != PTR_TO_PACKET && - reg->type != PTR_TO_PACKET_END) - continue; - __mark_reg_unknown(reg); + if (reg_is_pkt_pointer_any(reg)) + __mark_reg_unknown(reg); } } @@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ dst_reg->range = 0; @@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); dst_reg->off = ptr_reg->off; - if (ptr_reg->type == PTR_TO_PACKET) { + if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } static void find_good_pkt_pointers(struct bpf_verifier_state *state, - struct bpf_reg_state *dst_reg) + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type) { struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. */ for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id) + if (regs[i].type == type && regs[i].id == dst_reg->id) /* keep the maximum range already checked */ regs[i].range = max_t(u16, regs[i].range, dst_reg->off); @@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, if (state->stack_slot_type[i] != STACK_SPILL) continue; reg = &state->spilled_regs[i / BPF_REG_SIZE]; - if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) + if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, dst_reg->off); } } @@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(this_branch, dst_reg); + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && dst_reg->type == PTR_TO_PACKET && regs[insn->src_reg].type == PTR_TO_PACKET_END) { - find_good_pkt_pointers(other_branch, dst_reg); + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET); } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && dst_reg->type == PTR_TO_PACKET_END && regs[insn->src_reg].type == PTR_TO_PACKET) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg]); + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && + dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { + find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(other_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && + reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + regs[insn->src_reg].type == PTR_TO_PACKET_META) { + find_good_pkt_pointers(this_branch, ®s[insn->src_reg], + PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; @@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; /* Check our ids match any regs they're supposed to */ return check_ids(rold->id, rcur->id, idmap); + case PTR_TO_PACKET_META: case PTR_TO_PACKET: - if (rcur->type != PTR_TO_PACKET) + if (rcur->type != rold->type) return false; /* We must have at least as much range as the old ptr * did, so that any accesses which were safe before are -- cgit From 2b7c6ba945fd3c10ca3e030be402098aff2f89d3 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 26 Sep 2017 16:35:13 +0100 Subject: bpf/verifier: improve disassembly of BPF_END instructions print_bpf_insn() was treating all BPF_ALU[64] the same, but BPF_END has a different structure: it has a size in insn->imm (even if it's BPF_X) and uses the BPF_SRC (X or K) to indicate which endianness to use. So it needs different code to print it. Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f849eca36052..e8d7bb8e6b98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -332,26 +332,40 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; +static void print_bpf_end_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + static void print_bpf_insn(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_SRC(insn->code) == BPF_X) + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose("BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(env, insn); + } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); - else + } else { verbose("(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->imm); + } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", -- cgit From 73c864b38383f4abc9b559025cd663f4a81afa89 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 26 Sep 2017 16:35:29 +0100 Subject: bpf/verifier: improve disassembly of BPF_NEG instructions BPF_NEG takes only one operand, unlike the bulk of BPF_ALU[64] which are compound-assignments. So give it its own format in print_bpf_insn(). Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e8d7bb8e6b98..4cf9b72c59a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -351,6 +351,11 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, verbose("BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose("(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { verbose("(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", -- cgit From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:52 -0700 Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info The patch adds name and load_time to struct bpf_prog_aux. They are also exported to bpf_prog_info. The bpf_prog's name is passed by userspace during BPF_PROG_LOAD. The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes and the name stored in the kernel is always \0 terminated. The kernel will reject name that contains characters other than isalnum() and '_'. It will also reject name that is not null terminated. The existing 'user->uid' of the bpf_prog_aux is also exported to the bpf_prog_info as created_by_uid. The existing 'used_maps' of the bpf_prog_aux is exported to the newly added members 'nr_map_ids' and 'map_ids' of the bpf_prog_info. On the input, nr_map_ids tells how big the userspace's map_ids buffer is. On the output, nr_map_ids tells the exact user_map_cnt and it will only copy up to the userspace's map_ids buffer is allowed. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 25d074920a00..45970df3f820 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL +/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. + * Return 0 on success and < 0 on error. + */ +static int bpf_obj_name_cpy(char *dst, const char *src) +{ + const char *end = src + BPF_OBJ_NAME_LEN; + + /* Copy all isalnum() and '_' char */ + while (src < end && *src) { + if (!isalnum(*src) && *src != '_') + return -EINVAL; + *dst++ = *src++; + } + + /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ + if (src == end) + return -EINVAL; + + /* '\0' terminates dst */ + *dst = 0; + + return 0; +} + #define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) @@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_flags +#define BPF_PROG_LOAD_LAST_FIELD prog_name static int bpf_prog_load(union bpf_attr *attr) { @@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_prog; + prog->aux->load_time = ktime_get_boot_ns(); + err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); + if (err) + goto free_prog; + /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) @@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.type = prog->type; info.id = prog->aux->id; + info.load_time = prog->aux->load_time; + info.created_by_uid = from_kuid_munged(current_user_ns(), + prog->aux->user->uid); memcpy(info.tag, prog->tag, sizeof(prog->tag)); + memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); + + ulen = info.nr_map_ids; + info.nr_map_ids = prog->aux->used_map_cnt; + ulen = min_t(u32, info.nr_map_ids, ulen); + if (ulen) { + u32 *user_map_ids = (u32 *)info.map_ids; + u32 i; + + for (i = 0; i < ulen; i++) + if (put_user(prog->aux->used_maps[i]->id, + &user_map_ids[i])) + return -EFAULT; + } if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; -- cgit From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 27 Sep 2017 14:37:53 -0700 Subject: bpf: Add map_name to bpf_map_info This patch allows userspace to specify a name for a map during BPF_MAP_CREATE. The map's name can later be exported to user space via BPF_OBJ_GET_INFO_BY_FD. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 45970df3f820..11a7f82a55d1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD numa_node +#define BPF_MAP_CREATE_LAST_FIELD map_name /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = bpf_obj_name_cpy(map->name, attr->map_name); + if (err) + goto free_map_nouncharge; + atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.value_size = map->value_size; info.max_entries = map->max_entries; info.map_flags = map->map_flags; + memcpy(info.name, map->name, sizeof(map->name)); if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit From a1f7164c7b8b0d46f63bfb4ca0bb5971c760b921 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 09:00:18 -0700 Subject: sched: Misc preps for cgroup unified hierarchy interface Make the following changes in preparation for the cpu controller interface implementation for cgroup2. This patch doesn't cause any functional differences. * s/cpu_stats_show()/cpu_cfs_stat_show()/ * s/cpu_files/cpu_legacy_files/ v2: Dropped cpuacct changes as it won't be used by cpu controller interface anymore. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- kernel/sched/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18a6966567da..6815fa424a7a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6599,7 +6599,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct seq_file *sf, void *v) +static int cpu_cfs_stat_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -6639,7 +6639,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ -static struct cftype cpu_files[] = { +static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -6660,7 +6660,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .seq_show = cpu_stats_show, + .seq_show = cpu_cfs_stat_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -6686,7 +6686,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_files, + .legacy_cftypes = cpu_legacy_files, .early_init = true, }; -- cgit From 0d5936344f30aba0f6ddb92b030cb6a05168efe6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Sep 2017 09:00:19 -0700 Subject: sched: Implement interface for cgroup unified hierarchy There are a couple interface issues which can be addressed in cgroup2 interface. * Stats from cpuacct being reported separately from the cpu stats. * Use of different time units. Writable control knobs use microseconds, some stat fields use nanoseconds while other cpuacct stat fields use centiseconds. * Control knobs which can't be used in the root cgroup still show up in the root. * Control knob names and semantics aren't consistent with other controllers. This patchset implements cpu controller's interface on cgroup2 which adheres to the controller file conventions described in Documentation/cgroups/cgroup-v2.txt. Overall, the following changes are made. * cpuacct is implictly enabled and disabled by cpu and its information is reported through "cpu.stat" which now uses microseconds for all time durations. All time duration fields now have "_usec" appended to them for clarity. Note that cpuacct.usage_percpu is currently not included in "cpu.stat". If this information is actually called for, it will be added later. * "cpu.shares" is replaced with "cpu.weight" and operates on the standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000). The weight is scaled to scheduler weight so that 100 maps to 1024 and the ratio relationship is preserved - if weight is W and its scaled value is S, W / 100 == S / 1024. While the mapped range is a bit smaller than the orignal scheduler weight range, the dead zones on both sides are relatively small and covers wider range than the nice value mappings. This file doesn't make sense in the root cgroup and isn't created on root. * "cpu.weight.nice" is added. When read, it reads back the nice value which is closest to the current "cpu.weight". When written, it sets "cpu.weight" to the weight value which matches the nice value. This makes it easy to configure cgroups when they're competing against threads in threaded subtrees. * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max" which contains both quota and period. v4: - Use cgroup2 basic usage stat as the information source instead of cpuacct. v3: - Added "cpu.weight.nice" to allow using nice values when configuring the weight. The feature is requested by PeterZ. - Merge the patch to enable threaded support on cpu and cpuacct. - Dropped the bits about getting rid of cpuacct from patch description as there is a pretty strong case for making cpuacct an implicit controller so that basic cpu usage stats are always available. - Documentation updated accordingly. "cpu.rt.max" section is dropped for now. v2: - cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for CFS bandwidth stats and also using raw division for u64. Use CONFIG_CFS_BANDWITH and do_div() instead. "cpu.rt.max" is not included yet. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Li Zefan Cc: Johannes Weiner --- kernel/sched/core.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 171 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6815fa424a7a..ad255162a830 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6678,6 +6678,175 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; +static int cpu_stat_show(struct seq_file *sf, void *v) +{ + cgroup_stat_show_cputime(sf, ""); + +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(seq_css(sf)); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 throttled_usec; + + throttled_usec = cfs_b->throttled_time; + do_div(throttled_usec, NSEC_PER_USEC); + + seq_printf(sf, "nr_periods %d\n" + "nr_throttled %d\n" + "throttled_usec %llu\n", + cfs_b->nr_periods, cfs_b->nr_throttled, + throttled_usec); + } +#endif + return 0; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + u64 weight = scale_load_down(tg->shares); + + return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); +} + +static int cpu_weight_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 weight) +{ + /* + * cgroup weight knobs should use the common MIN, DFL and MAX + * values which are 1, 100 and 10000 respectively. While it loses + * a bit of range on both ends, it maps pretty well onto the shares + * value used by scheduler and the round-trip conversions preserve + * the original value over the entire range. + */ + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + return -ERANGE; + + weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} + +static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + unsigned long weight = scale_load_down(css_tg(css)->shares); + int last_delta = INT_MAX; + int prio, delta; + + /* find the closest nice value to the current weight */ + for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { + delta = abs(sched_prio_to_weight[prio] - weight); + if (delta >= last_delta) + break; + last_delta = delta; + } + + return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); +} + +static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 nice) +{ + unsigned long weight; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; + + weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} +#endif + +static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, + long period, long quota) +{ + if (quota < 0) + seq_puts(sf, "max"); + else + seq_printf(sf, "%ld", quota); + + seq_printf(sf, " %ld\n", period); +} + +/* caller should put the current value in *@periodp before calling */ +static int __maybe_unused cpu_period_quota_parse(char *buf, + u64 *periodp, u64 *quotap) +{ + char tok[21]; /* U64_MAX */ + + if (!sscanf(buf, "%s %llu", tok, periodp)) + return -EINVAL; + + *periodp *= NSEC_PER_USEC; + + if (sscanf(tok, "%llu", quotap)) + *quotap *= NSEC_PER_USEC; + else if (!strcmp(tok, "max")) + *quotap = RUNTIME_INF; + else + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_CFS_BANDWIDTH +static int cpu_max_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + return 0; +} + +static ssize_t cpu_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct task_group *tg = css_tg(of_css(of)); + u64 period = tg_get_cfs_period(tg); + u64 quota; + int ret; + + ret = cpu_period_quota_parse(buf, &period, "a); + if (!ret) + ret = tg_set_cfs_bandwidth(tg, period, quota); + return ret ?: nbytes; +} +#endif + +static struct cftype cpu_files[] = { + { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_stat_show, + }, +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_weight_read_u64, + .write_u64 = cpu_weight_write_u64, + }, + { + .name = "weight.nice", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_weight_nice_read_s64, + .write_s64 = cpu_weight_nice_write_s64, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_max_show, + .write = cpu_max_write, + }, +#endif + { } /* terminate */ +}; + struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, @@ -6687,7 +6856,9 @@ struct cgroup_subsys cpu_cgrp_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_legacy_files, + .dfl_cftypes = cpu_files, .early_init = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_SCHED */ -- cgit From 721e08dad17e226ef68819d0a23dc53c25fe8ea5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 29 Sep 2017 10:52:17 -0700 Subject: bpf: Fix compiler warning on info.map_ids for 32bit platform This patch uses u64_to_user_ptr() to cast info.map_ids to a userspace ptr. It also tags the user_map_ids with '__user' for sparse check. Fixes: cb4d2b3f03d8 ("bpf: Add name, load_time, uid and map_ids to bpf_prog_info") Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 11a7f82a55d1..b927da66f653 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1405,7 +1405,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.nr_map_ids = prog->aux->used_map_cnt; ulen = min_t(u32, info.nr_map_ids, ulen); if (ulen) { - u32 *user_map_ids = (u32 *)info.map_ids; + u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); u32 i; for (i = 0; i < ulen; i++) -- cgit From 7813dd6fc75fb375d4caf002e7f80a826fc3153a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 26 Sep 2017 15:12:40 -0700 Subject: PM / OPP: Move the OPP directory out of power/ The drivers/base/power/ directory is special and contains code related to power management core like system suspend/resume, hibernation, etc. It was fine to keep the OPP code inside it when we had just one file for it, but it is growing now and already has a directory for itself. Lets move it directly under drivers/ directory, just like cpufreq and cpuidle. Signed-off-by: Viresh Kumar Acked-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e8517b63eb37..e880ca22c5a5 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -259,20 +259,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config PM_OPP - bool - select SRCU - ---help--- - SOCs have a standard set of tuples consisting of frequency and - voltage pairs that the device will support per voltage domain. This - is called Operating Performance Point or OPP. The actual definitions - of OPP varies over silicon within the same family of devices. - - OPP layer organizes the data internally using device pointers - representing individual voltage domains and provides SOC - implementations a ready to use framework to manage OPPs. - For more information, read - config PM_CLK def_bool y depends on PM && HAVE_CLK -- cgit From 64ec72a1ece37d9bc7ba8b11d6091ce7cb1d8eec Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 27 Sep 2017 22:01:34 -0700 Subject: PM: Use a more common logging style Convert printks to pr_. Miscellanea: o Use pr_fmt with "PM:" and remove "PM: " from format strings o Coalesce format strings and realign format arguments o Convert an embedded incorrect function name to "%s: ", __func__ o Convert a couple multi-line formats to multiple pr_ calls Signed-off-by: Joe Perches Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 4 +- kernel/power/snapshot.c | 35 ++++++------- kernel/power/swap.c | 128 +++++++++++++++++++++--------------------------- 3 files changed, 77 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..9d7503910ce2 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void) for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i], d); if (ret < 0) { - printk(KERN_ERR "pm_qos_param: %s setup failed\n", - pm_qos_array[i]->name); + pr_err("%s: %s setup failed\n", + __func__, pm_qos_array[i]->name); return ret; } } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0972a8e09d08..a917a301e201 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -10,6 +10,8 @@ * */ +#define pr_fmt(fmt) "PM: " fmt + #include #include #include @@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn, region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); Report: - printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n", + pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n", (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); } @@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n", (unsigned long long) region->start_pfn << PAGE_SHIFT, ((unsigned long long) region->end_pfn << PAGE_SHIFT) - 1); @@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void) free_pages_map = bm2; mark_nosave_pages(forbidden_pages_map); - pr_debug("PM: Basic memory bitmaps created\n"); + pr_debug("Basic memory bitmaps created\n"); return 0; @@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void) memory_bm_free(bm2, PG_UNSAFE_CLEAR); kfree(bm2); - pr_debug("PM: Basic memory bitmaps freed\n"); + pr_debug("Basic memory bitmaps freed\n"); } void clear_free_pages(void) @@ -1152,7 +1154,7 @@ void clear_free_pages(void) pfn = memory_bm_next_pfn(bm); } memory_bm_position_reset(bm); - pr_info("PM: free pages cleared after restore\n"); + pr_info("free pages cleared after restore\n"); #endif /* PAGE_POISONING_ZERO */ } @@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void) ktime_t start, stop; int error; - printk(KERN_INFO "PM: Preallocating image memory... "); + pr_info("Preallocating image memory... "); start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); @@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void) out: stop = ktime_get(); - printk(KERN_CONT "done (allocated %lu pages)\n", pages); + pr_cont("done (allocated %lu pages)\n", pages); swsusp_show_speed(start, stop, pages, "Allocated"); return 0; err_out: - printk(KERN_CONT "\n"); + pr_cont("\n"); swsusp_free(); return -ENOMEM; } @@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) free += zone_page_state(zone, NR_FREE_PAGES); nr_pages += count_pages_for_highmem(nr_highmem); - pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", - nr_pages, PAGES_FOR_IO, free); + pr_debug("Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); return free > nr_pages + PAGES_FOR_IO; } @@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - printk(KERN_INFO "PM: Creating hibernation image:\n"); + pr_info("Creating hibernation image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); nr_highmem = count_highmem_pages(); - printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); + pr_info("Need to copy %u pages\n", nr_pages + nr_highmem); if (!enough_free_mem(nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Not enough free memory\n"); + pr_err("Not enough free memory\n"); return -ENOMEM; } if (swsusp_alloc(©_bm, nr_pages, nr_highmem)) { - printk(KERN_ERR "PM: Memory allocation failed\n"); + pr_err("Memory allocation failed\n"); return -ENOMEM; } @@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void) nr_copy_pages = nr_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", - nr_pages); + pr_info("Hibernation image created (%d pages copied)\n", nr_pages); return 0; } @@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info) if (!reason && info->num_physpages != get_num_physpages()) reason = "memory size"; if (reason) { - printk(KERN_ERR "PM: Image mismatch: %s\n", reason); + pr_err("Image mismatch: %s\n", reason); return -EPERM; } return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d7cdc426ee38..293ead59eccc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,6 +12,8 @@ * */ +#define pr_fmt(fmt) "PM: " fmt + #include #include #include @@ -241,9 +243,9 @@ static void hib_end_io(struct bio *bio) struct page *page = bio->bi_io_vec[0].bv_page; if (bio->bi_status) { - printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); } if (bio_data_dir(bio) == WRITE) @@ -273,8 +275,8 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", - (unsigned long long)bio->bi_iter.bi_sector); + pr_err("Adding page to bio failed at %llu\n", + (unsigned long long)bio->bi_iter.bi_sector); bio_put(bio); return -EFAULT; } @@ -319,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC, swsusp_resume_block, swsusp_header, NULL); } else { - printk(KERN_ERR "PM: Swap header not found!\n"); + pr_err("Swap header not found!\n"); error = -ENODEV; } return error; @@ -413,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle) ret = swsusp_swap_check(); if (ret) { if (ret != -ENOSPC) - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + pr_err("Cannot find swap device, try swapon -a\n"); return ret; } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); @@ -491,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, { if (!error) { flush_swap_writer(handle); - printk(KERN_INFO "PM: S"); + pr_info("S"); error = mark_swapfiles(handle, flags); - printk("|\n"); + pr_cont("|\n"); } if (error) @@ -542,7 +543,7 @@ static int save_image(struct swap_map_handle *handle, hib_init_batch(&hb); - printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", + pr_info("Saving image data pages (%u pages)...\n", nr_to_write); m = nr_to_write / 10; if (!m) @@ -557,8 +558,8 @@ static int save_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_INFO "PM: Image saving progress: %3d%%\n", - nr_pages / m * 10); + pr_info("Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); @@ -566,7 +567,7 @@ static int save_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) - printk(KERN_INFO "PM: Image saving done.\n"); + pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } @@ -692,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle, page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; goto out_clean; } data = vmalloc(sizeof(*data) * nr_threads); if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; goto out_clean; } @@ -708,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle, crc = kmalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); + pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } @@ -726,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle, "image_compress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start compression threads\n"); + pr_err("Cannot start compression threads\n"); ret = -ENOMEM; goto out_clean; } @@ -749,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle, crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } @@ -760,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle, */ handle->reqd_free_pages = reqd_free_pages(); - printk(KERN_INFO - "PM: Using %u thread(s) for compression.\n" - "PM: Compressing and saving image data (%u pages)...\n", - nr_threads, nr_to_write); + pr_info("Using %u thread(s) for compression\n", nr_threads); + pr_info("Compressing and saving image data (%u pages)...\n", + nr_to_write); m = nr_to_write / 10; if (!m) m = 1; @@ -783,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle, data_of(*snapshot), PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_INFO - "PM: Image saving progress: " - "%3d%%\n", - nr_pages / m * 10); + pr_info("Image saving progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } if (!off) @@ -813,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle, ret = data[thr].ret; if (ret < 0) { - printk(KERN_ERR "PM: LZO compression failed\n"); + pr_err("LZO compression failed\n"); goto out_finish; } if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > lzo1x_worst_compress(data[thr].unc_len))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); + pr_err("Invalid LZO compressed length\n"); ret = -1; goto out_finish; } @@ -857,7 +853,7 @@ out_finish: if (!ret) ret = err2; if (!ret) - printk(KERN_INFO "PM: Image saving done.\n"); + pr_info("Image saving done\n"); swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: if (crc) { @@ -888,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags) unsigned int free_swap = count_swap_pages(root_swap, 1); unsigned int required; - pr_debug("PM: Free swap pages: %u\n", free_swap); + pr_debug("Free swap pages: %u\n", free_swap); required = PAGES_FOR_IO + nr_pages; return free_swap > required; @@ -915,12 +911,12 @@ int swsusp_write(unsigned int flags) pages = snapshot_get_image_size(); error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot get swap writer\n"); + pr_err("Cannot get swap writer\n"); return error; } if (flags & SF_NOCOMPRESS_MODE) { if (!enough_swap(pages, flags)) { - printk(KERN_ERR "PM: Not enough free swap\n"); + pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; } @@ -1068,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle, hib_init_batch(&hb); clean_pages_on_read = true; - printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n", - nr_to_read); + pr_info("Loading image data pages (%u pages)...\n", nr_to_read); m = nr_to_read / 10; if (!m) m = 1; @@ -1087,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle, if (ret) break; if (!(nr_pages % m)) - printk(KERN_INFO "PM: Image loading progress: %3d%%\n", - nr_pages / m * 10); + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; } err2 = hib_wait_io(&hb); @@ -1096,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle, if (!ret) ret = err2; if (!ret) { - printk(KERN_INFO "PM: Image loading done.\n"); + pr_info("Image loading done\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; @@ -1190,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle, page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; goto out_clean; } data = vmalloc(sizeof(*data) * nr_threads); if (!data) { - printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; goto out_clean; } @@ -1206,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle, crc = kmalloc(sizeof(*crc), GFP_KERNEL); if (!crc) { - printk(KERN_ERR "PM: Failed to allocate crc\n"); + pr_err("Failed to allocate crc\n"); ret = -ENOMEM; goto out_clean; } @@ -1226,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle, "image_decompress/%u", thr); if (IS_ERR(data[thr].thr)) { data[thr].thr = NULL; - printk(KERN_ERR - "PM: Cannot start decompression threads\n"); + pr_err("Cannot start decompression threads\n"); ret = -ENOMEM; goto out_clean; } @@ -1249,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle, crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); if (IS_ERR(crc->thr)) { crc->thr = NULL; - printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + pr_err("Cannot start CRC32 thread\n"); ret = -ENOMEM; goto out_clean; } @@ -1274,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (!page[i]) { if (i < LZO_CMP_PAGES) { ring_size = i; - printk(KERN_ERR - "PM: Failed to allocate LZO pages\n"); + pr_err("Failed to allocate LZO pages\n"); ret = -ENOMEM; goto out_clean; } else { @@ -1285,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle, } want = ring_size = i; - printk(KERN_INFO - "PM: Using %u thread(s) for decompression.\n" - "PM: Loading and decompressing image data (%u pages)...\n", - nr_threads, nr_to_read); + pr_info("Using %u thread(s) for decompression\n", nr_threads); + pr_info("Loading and decompressing image data (%u pages)...\n", + nr_to_read); m = nr_to_read / 10; if (!m) m = 1; @@ -1348,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle, if (unlikely(!data[thr].cmp_len || data[thr].cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { - printk(KERN_ERR - "PM: Invalid LZO compressed length\n"); + pr_err("Invalid LZO compressed length\n"); ret = -1; goto out_finish; } @@ -1400,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle, ret = data[thr].ret; if (ret < 0) { - printk(KERN_ERR - "PM: LZO decompression failed\n"); + pr_err("LZO decompression failed\n"); goto out_finish; } if (unlikely(!data[thr].unc_len || data[thr].unc_len > LZO_UNC_SIZE || data[thr].unc_len & (PAGE_SIZE - 1))) { - printk(KERN_ERR - "PM: Invalid LZO uncompressed length\n"); + pr_err("Invalid LZO uncompressed length\n"); ret = -1; goto out_finish; } @@ -1420,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle, data[thr].unc + off, PAGE_SIZE); if (!(nr_pages % m)) - printk(KERN_INFO - "PM: Image loading progress: " - "%3d%%\n", - nr_pages / m * 10); + pr_info("Image loading progress: %3d%%\n", + nr_pages / m * 10); nr_pages++; ret = snapshot_write_next(snapshot); @@ -1448,15 +1435,14 @@ out_finish: } stop = ktime_get(); if (!ret) { - printk(KERN_INFO "PM: Image loading done.\n"); + pr_info("Image loading done\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; if (!ret) { if (swsusp_header->flags & SF_CRC32_MODE) { if(handle->crc32 != swsusp_header->crc32) { - printk(KERN_ERR - "PM: Invalid image CRC32!\n"); + pr_err("Invalid image CRC32!\n"); ret = -ENODATA; } } @@ -1513,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p) swap_reader_finish(&handle); end: if (!error) - pr_debug("PM: Image successfully loaded\n"); + pr_debug("Image successfully loaded\n"); else - pr_debug("PM: Error %d resuming\n", error); + pr_debug("Error %d resuming\n", error); return error; } @@ -1552,13 +1538,13 @@ put: if (error) blkdev_put(hib_resume_bdev, FMODE_READ); else - pr_debug("PM: Image signature found, resuming\n"); + pr_debug("Image signature found, resuming\n"); } else { error = PTR_ERR(hib_resume_bdev); } if (error) - pr_debug("PM: Image not found (code %d)\n", error); + pr_debug("Image not found (code %d)\n", error); return error; } @@ -1570,7 +1556,7 @@ put: void swsusp_close(fmode_t mode) { if (IS_ERR(hib_resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); + pr_debug("Image device not initialised\n"); return; } @@ -1594,7 +1580,7 @@ int swsusp_unmark(void) swsusp_resume_block, swsusp_header, NULL); } else { - printk(KERN_ERR "PM: Cannot find swsusp signature!\n"); + pr_err("Cannot find swsusp signature!\n"); error = -ENODEV; } -- cgit From d8c4deee6dc6876ae3b7d09a5b58138a57ee45f6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 8 Sep 2017 23:55:17 -0700 Subject: tracing: Remove obsolete sched_switch tracer selftest Since commit 87d80de2800d087ea833cb79bc13f85ff34ed49f ("tracing: Remove obsolete sched_switch tracer"), the sched_switch tracer selftest is no longer used. This patch removes the same. Link: http://lkml.kernel.org/r/20170909065517.22262-1-joelaf@google.com Cc: Ingo Molnar Cc: kernel-team@android.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 -- kernel/trace/trace_selftest.c | 32 -------------------------------- 2 files changed, 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 652c682707cd..3c078e2ad777 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -738,8 +738,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sched_switch(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); /* diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b17ec642793b..364f78abdf47 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1150,38 +1150,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_SCHED_TRACER */ -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -int -trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(&tr->trace_buffer, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) -- cgit From 6e7a2398114a2597a0995f96f44d908741ae5035 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 23 Aug 2017 12:23:09 +0100 Subject: tracing: Remove redundant unread variable ret Integer ret is being assigned but never used and hence it is redundant. Remove it, fixes clang warning: trace_events_hist.c:1077:3: warning: Value stored to 'ret' is never read Link: http://lkml.kernel.org/r/20170823112309.19383-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1c21d0e2a145..f123b5d0c226 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1062,7 +1062,7 @@ static void hist_trigger_show(struct seq_file *m, struct event_trigger_data *data, int n) { struct hist_trigger_data *hist_data; - int n_entries, ret = 0; + int n_entries; if (n > 0) seq_puts(m, "\n\n"); @@ -1073,10 +1073,8 @@ static void hist_trigger_show(struct seq_file *m, hist_data = data->private_data; n_entries = print_entries(m, hist_data); - if (n_entries < 0) { - ret = n_entries; + if (n_entries < 0) n_entries = 0; - } seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), -- cgit From 12ecef0cb12102d8c034770173d2d1363cb97d52 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Sep 2017 16:22:49 -0400 Subject: tracing: Reverse the order of trace_types_lock and event_mutex In order to make future changes where we need to call tracing_set_clock() from within an event command, the order of trace_types_lock and event_mutex must be reversed, as the event command will hold event_mutex and the trace_types_lock is taken from within tracing_set_clock(). Link: http://lkml.kernel.org/r/20170921162249.0dde3dca@gandalf.local.home Requested-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 5 +++++ kernel/trace/trace_events.c | 31 +++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 752e5daf0896..5f1ac7d3402c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *name) struct trace_array *tr; int ret; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = -EEXIST; @@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *name) list_add(&tr->list, &ftrace_trace_arrays); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return 0; @@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *name) out_unlock: mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; @@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *name) int ret; int i; + mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); ret = -ENODEV; @@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *name) out_unlock: mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 87468398b9ed..ec0f9aa4e151 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp) return -ENODEV; /* Make sure the system still exists */ - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { list_for_each_entry(dir, &tr->systems, list) { if (dir == inode->i_private) { @@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file *filp) } } exit_loop: - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); if (!system) return -ENODEV; @@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struct trace_event_call *call); int trace_add_event_call(struct trace_event_call *call) { int ret; - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); ret = __register_event(call, NULL); if (ret >= 0) __add_event_to_tracers(call); - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; } @@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace_event_call *call) { int ret; - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); down_write(&trace_event_sem); ret = probe_remove_event_call(call); up_write(&trace_event_sem); - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return ret; } @@ -2424,8 +2424,8 @@ static int trace_module_notify(struct notifier_block *self, { struct module *mod = data; - mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); switch (val) { case MODULE_STATE_COMING: trace_module_add_events(mod); @@ -2434,8 +2434,8 @@ static int trace_module_notify(struct notifier_block *self, trace_module_remove_events(mod); break; } - mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); return 0; } @@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) * creates the event hierachry in the @parent/events directory. * * Returns 0 on success. + * + * Must be called with event_mutex held. */ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) { int ret; - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); ret = create_event_toplevel_files(parent, tr); if (ret) - goto out_unlock; + goto out; down_write(&trace_event_sem); __trace_add_event_dirs(tr); up_write(&trace_event_sem); - out_unlock: - mutex_unlock(&event_mutex); - + out: return ret; } @@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) return ret; } +/* Must be called with event_mutex held */ int event_trace_del_tracer(struct trace_array *tr) { - mutex_lock(&event_mutex); + lockdep_assert_held(&event_mutex); /* Disable any event triggers and associated soft-disabled events */ clear_event_triggers(tr); @@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_array *tr) tr->event_dir = NULL; - mutex_unlock(&event_mutex); - return 0; } -- cgit From 1a149d7d3f45d311da1f63473736c05f30ae8a75 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 22 Sep 2017 16:59:02 -0400 Subject: ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler The current method to prevent the ring buffer from entering into a recursize loop is to use a bitmask and set the bit that maps to the current context (normal, softirq, irq or NMI), and if that bit was already set, it is considered a recursive loop. New code is being added that may require the ring buffer to be entered a second time in the current context. The recursive locking prevents that from happening. Instead of mapping a bitmask to the current context, just allow 4 levels of nesting in the ring buffer. This matches the 4 context levels that it can already nest. It is highly unlikely to have more than two levels, thus it should be fine when we add the second entry into the ring buffer. If that proves to be a problem, we can always up the number to 8. An added benefit is that reading preempt_count() to get the current level adds a very slight but noticeable overhead. This removes that need. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 64 ++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 81279c6602ff..f6ee9b1ef62a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2538,61 +2538,29 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. To pass this - * information from the lock to the unlock without having to - * access the 'in_interrupt()' functions again (which do show - * a bit of overhead in something as critical as function tracing, - * we use a bitmask trick. + * be modified more than once via an interrupt. There are four + * different contexts that we need to consider. * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. + * Normal context. + * SoftIRQ context + * IRQ context + * NMI context * - * This works because this is the order of contexts that can - * preempt other contexts. A SoftIRQ never preempts an IRQ - * context. - * - * When the context is determined, the corresponding bit is - * checked and set (if it was set, then a recursion of that context - * happened). - * - * On unlock, we need to clear this bit. To do so, just subtract - * 1 from the current_context and AND it to itself. - * - * (binary) - * 101 - 1 = 100 - * 101 & 100 = 100 (clearing bit zero) - * - * 1010 - 1 = 1001 - * 1010 & 1001 = 1000 (clearing bit 1) - * - * The least significant bit can be cleared this way, and it - * just so happens that it is the same bit corresponding to - * the current context. + * If for some reason the ring buffer starts to recurse, we + * only allow that to happen at most 4 times (one for each + * context). If it happens 5 times, then we consider this a + * recusive loop and do not let it go further. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned int val = cpu_buffer->current_context; - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = RB_CTX_NMI; - else if (in_irq()) - bit = RB_CTX_IRQ; - else - bit = RB_CTX_SOFTIRQ; - } else - bit = RB_CTX_NORMAL; - - if (unlikely(val & (1 << bit))) + if (cpu_buffer->current_context >= 4) return 1; - val |= (1 << bit); - cpu_buffer->current_context = val; + cpu_buffer->current_context++; + /* Interrupts must see this update */ + barrier(); return 0; } @@ -2600,7 +2568,9 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - cpu_buffer->current_context &= cpu_buffer->current_context - 1; + /* Don't let the dec leak out */ + barrier(); + cpu_buffer->current_context--; } /** -- cgit From a15f7fc20389a8827d5859907568b201234d4b79 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:17 -0500 Subject: tracing: Exclude 'generic fields' from histograms There are a small number of 'generic fields' (comm/COMM/cpu/CPU) that are found by trace_find_event_field() but are only meant for filtering. Specifically, they unlike normal fields, they have a size of 0 and thus wreak havoc when used as a histogram key. Exclude these (return -EINVAL) when used as histogram keys. Link: http://lkml.kernel.org/r/956154cbc3e8a4f0633d619b886c97f0f0edf7b4.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f123b5d0c226..121d56850f24 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -450,7 +450,7 @@ static int create_val_field(struct hist_trigger_data *hist_data, } field = trace_find_event_field(file->event_call, field_name); - if (!field) { + if (!field || !field->size) { ret = -EINVAL; goto out; } @@ -548,7 +548,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, } field = trace_find_event_field(file->event_call, field_name); - if (!field) { + if (!field || !field->size) { ret = -EINVAL; goto out; } -- cgit From 83c07ecc4203728e85fc4a2ce6fdf25d16ea118e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:18 -0500 Subject: tracing: Remove lookups from tracing_map hitcount Lookups inflate the hitcount, making it essentially useless. Only inserts and updates should really affect the hitcount anyway, so explicitly filter lookups out. Link: http://lkml.kernel.org/r/c8d9dc39d269a8abf88bf4102d0dfc65deb0fc7f.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 305039b122fa..07e75344725b 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) if (test_key && test_key == key_hash && entry->val && keys_match(key, entry->val->key, map->key_size)) { - atomic64_inc(&map->hits); + if (!lookup_only) + atomic64_inc(&map->hits); return entry->val; } -- cgit From 4f36c2d85cedea60ad424d44534121ab0458069e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:19 -0500 Subject: tracing: Increase tracing map KEYS_MAX size The current default for the number of subkeys in a compound key is 2, which is too restrictive. Increase it to a more realistic value of 3. Link: http://lkml.kernel.org/r/b6952cca06d1f912eba33804a6fd6069b3847d44.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 618838f5f30a..f0975110b967 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -5,7 +5,7 @@ #define TRACING_MAP_BITS_MAX 17 #define TRACING_MAP_BITS_MIN 7 -#define TRACING_MAP_KEYS_MAX 2 +#define TRACING_MAP_KEYS_MAX 3 #define TRACING_MAP_VALS_MAX 3 #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ TRACING_MAP_VALS_MAX) -- cgit From 7e465baa80293ed5f87fdf6405391d6f02110d4e Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:20 -0500 Subject: tracing: Make traceprobe parsing code reusable traceprobe_probes_write() and traceprobe_command() actually contain nothing that ties them to kprobes - the code is generically useful for similar types of parsing elsewhere, so separate it out and move it to trace.c/trace.h. Other than moving it, the only change is in naming: traceprobe_probes_write() becomes trace_parse_run_command() and traceprobe_command() becomes trace_run_command(). Link: http://lkml.kernel.org/r/ae5c26ea40c196a8986854d921eb6e713ede7e3f.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 86 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 7 ++++ kernel/trace/trace_kprobe.c | 18 +++++----- kernel/trace/trace_probe.c | 86 --------------------------------------------- kernel/trace/trace_probe.h | 7 ---- kernel/trace/trace_uprobe.c | 2 +- 6 files changed, 103 insertions(+), 103 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5f1ac7d3402c..73e67b68c53b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8281,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) } EXPORT_SYMBOL_GPL(ftrace_dump); +int trace_run_command(const char *buf, int (*createfn)(int, char **)) +{ + char **argv; + int argc, ret; + + argc = 0; + ret = 0; + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = createfn(argc, argv); + + argv_free(argv); + + return ret; +} + +#define WRITE_BUFSIZE 4096 + +ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos, + int (*createfn)(int, char **)) +{ + char *kbuf, *buf, *tmp; + int ret = 0; + size_t done = 0; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + while (done < count) { + size = count - done; + + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + buf = kbuf; + do { + tmp = strchr(buf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - buf + 1; + } else { + size = strlen(buf); + if (done + size < count) { + if (buf != kbuf) + break; + /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE - 2); + ret = -EINVAL; + goto out; + } + } + done += size; + + /* Remove comments */ + tmp = strchr(buf, '#'); + + if (tmp) + *tmp = '\0'; + + ret = trace_run_command(buf, createfn); + if (ret) + goto out; + buf += size; + + } while (done < count); + } + ret = done; + +out: + kfree(kbuf); + + return ret; +} + __init static int tracer_alloc_buffers(void) { int ring_buf_size; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3c078e2ad777..f8343eb3c1f9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1752,6 +1752,13 @@ void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +#define MAX_EVENT_NAME_LEN 64 + +extern int trace_run_command(const char *buf, int (*createfn)(int, char**)); +extern ssize_t trace_parse_run_command(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos, + int (*createfn)(int, char**)); + /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8a907e12b6b9..af6134f2e597 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file) static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - return traceprobe_probes_write(file, buffer, count, ppos, - create_trace_kprobe); + return trace_parse_run_command(file, buffer, count, ppos, + create_trace_kprobe); } static const struct file_operations kprobe_events_ops = { @@ -1433,9 +1433,9 @@ static __init int kprobe_trace_self_tests_init(void) pr_info("Testing kprobe tracing: "); - ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " - "$stack $stack0 +0($stack)", - create_trace_kprobe); + ret = trace_run_command("p:testprobe kprobe_trace_selftest_target " + "$stack $stack0 +0($stack)", + create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function entry.\n"); warn++; @@ -1455,8 +1455,8 @@ static __init int kprobe_trace_self_tests_init(void) } } - ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " - "$retval", create_trace_kprobe); + ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target " + "$retval", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on probing function return.\n"); warn++; @@ -1526,13 +1526,13 @@ static __init int kprobe_trace_self_tests_init(void) disable_trace_kprobe(tk, file); } - ret = traceprobe_command("-:testprobe", create_trace_kprobe); + ret = trace_run_command("-:testprobe", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; } - ret = traceprobe_command("-:testprobe2", create_trace_kprobe); + ret = trace_run_command("-:testprobe2", create_trace_kprobe); if (WARN_ON_ONCE(ret)) { pr_warn("error on deleting a probe.\n"); warn++; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 52478f033f88..d59357308677 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) kfree(arg->comm); } -int traceprobe_command(const char *buf, int (*createfn)(int, char **)) -{ - char **argv; - int argc, ret; - - argc = 0; - ret = 0; - argv = argv_split(GFP_KERNEL, buf, &argc); - if (!argv) - return -ENOMEM; - - if (argc) - ret = createfn(argc, argv); - - argv_free(argv); - - return ret; -} - -#define WRITE_BUFSIZE 4096 - -ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos, - int (*createfn)(int, char **)) -{ - char *kbuf, *buf, *tmp; - int ret = 0; - size_t done = 0; - size_t size; - - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - while (done < count) { - size = count - done; - - if (size >= WRITE_BUFSIZE) - size = WRITE_BUFSIZE - 1; - - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } - kbuf[size] = '\0'; - buf = kbuf; - do { - tmp = strchr(buf, '\n'); - if (tmp) { - *tmp = '\0'; - size = tmp - buf + 1; - } else { - size = strlen(buf); - if (done + size < count) { - if (buf != kbuf) - break; - /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ - pr_warn("Line length is too long: Should be less than %d\n", - WRITE_BUFSIZE - 2); - ret = -EINVAL; - goto out; - } - } - done += size; - - /* Remove comments */ - tmp = strchr(buf, '#'); - - if (tmp) - *tmp = '\0'; - - ret = traceprobe_command(buf, createfn); - if (ret) - goto out; - buf += size; - - } while (done < count); - } - ret = done; - -out: - kfree(kbuf); - - return ret; -} - static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, bool is_return) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 903273c93e61..fb66e3eaa192 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -42,7 +42,6 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 -#define MAX_EVENT_NAME_LEN 64 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ @@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); -extern ssize_t traceprobe_probes_write(struct file *file, - const char __user *buffer, size_t count, loff_t *ppos, - int (*createfn)(int, char**)); - -extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); - /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..b34965e62fb9 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file) static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); + return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe); } static const struct file_operations uprobe_events_ops = { -- cgit From 0d7a8325bf3326c92da2d21b4496a9ddde896d4f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:21 -0500 Subject: tracing: Clean up hist_field_flags enum As we add more flags, specifying explicit integers for the flag values becomes more unwieldy and error-prone - switch them over to left-shift values. Link: http://lkml.kernel.org/r/e644e4fb7665aec015f4a2d84a2f990d3dd5b8a1.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 121d56850f24..0c7ec3048624 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -110,16 +110,16 @@ DEFINE_HIST_FIELD_FN(u8); #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) enum hist_field_flags { - HIST_FIELD_FL_HITCOUNT = 1, - HIST_FIELD_FL_KEY = 2, - HIST_FIELD_FL_STRING = 4, - HIST_FIELD_FL_HEX = 8, - HIST_FIELD_FL_SYM = 16, - HIST_FIELD_FL_SYM_OFFSET = 32, - HIST_FIELD_FL_EXECNAME = 64, - HIST_FIELD_FL_SYSCALL = 128, - HIST_FIELD_FL_STACKTRACE = 256, - HIST_FIELD_FL_LOG2 = 512, + HIST_FIELD_FL_HITCOUNT = 1 << 0, + HIST_FIELD_FL_KEY = 1 << 1, + HIST_FIELD_FL_STRING = 1 << 2, + HIST_FIELD_FL_HEX = 1 << 3, + HIST_FIELD_FL_SYM = 1 << 4, + HIST_FIELD_FL_SYM_OFFSET = 1 << 5, + HIST_FIELD_FL_EXECNAME = 1 << 6, + HIST_FIELD_FL_SYSCALL = 1 << 7, + HIST_FIELD_FL_STACKTRACE = 1 << 8, + HIST_FIELD_FL_LOG2 = 1 << 9, }; struct hist_trigger_attrs { -- cgit From 85013256cf01629f72a327674c5d007b4a4b40da Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:22 -0500 Subject: tracing: Add hist_field_name() accessor In preparation for hist_fields that won't be strictly based on trace_event_fields, add a new hist_field_name() accessor to allow that flexibility and update associated users. Link: http://lkml.kernel.org/r/5b5a2d36dde067cbbe2434b10f06daac27b7dbd5.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 67 +++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0c7ec3048624..34edf5fd85fd 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -146,6 +146,23 @@ struct hist_trigger_data { struct tracing_map *map; }; +static const char *hist_field_name(struct hist_field *field, + unsigned int level) +{ + const char *field_name = ""; + + if (level > 1) + return field_name; + + if (field->field) + field_name = field->field->name; + + if (field_name == NULL) + field_name = ""; + + return field_name; +} + static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) { hist_field_fn_t fn = NULL; @@ -653,7 +670,6 @@ static int is_descending(const char *str) static int create_sort_keys(struct hist_trigger_data *hist_data) { char *fields_str = hist_data->attrs->sort_key_str; - struct ftrace_event_field *field = NULL; struct tracing_map_sort_key *sort_key; int descending, ret = 0; unsigned int i, j; @@ -670,7 +686,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) } for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { + struct hist_field *hist_field; char *field_str, *field_name; + const char *test_name; sort_key = &hist_data->sort_keys[i]; @@ -703,8 +721,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) } for (j = 1; j < hist_data->n_fields; j++) { - field = hist_data->fields[j]->field; - if (field && (strcmp(field_name, field->name) == 0)) { + hist_field = hist_data->fields[j]; + test_name = hist_field_name(hist_field, 0); + + if (strcmp(field_name, test_name) == 0) { sort_key->field_idx = j; descending = is_descending(field_str); if (descending < 0) { @@ -952,6 +972,7 @@ hist_trigger_entry_print(struct seq_file *m, struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; bool multiline = false; + const char *field_name; unsigned int i; u64 uval; @@ -963,26 +984,27 @@ hist_trigger_entry_print(struct seq_file *m, if (i > hist_data->n_vals) seq_puts(m, ", "); + field_name = hist_field_name(key_field, 0); + if (key_field->flags & HIST_FIELD_FL_HEX) { uval = *(u64 *)(key + key_field->offset); - seq_printf(m, "%s: %llx", - key_field->field->name, uval); + seq_printf(m, "%s: %llx", field_name, uval); } else if (key_field->flags & HIST_FIELD_FL_SYM) { uval = *(u64 *)(key + key_field->offset); sprint_symbol_no_offset(str, uval); - seq_printf(m, "%s: [%llx] %-45s", - key_field->field->name, uval, str); + seq_printf(m, "%s: [%llx] %-45s", field_name, + uval, str); } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { uval = *(u64 *)(key + key_field->offset); sprint_symbol(str, uval); - seq_printf(m, "%s: [%llx] %-55s", - key_field->field->name, uval, str); + seq_printf(m, "%s: [%llx] %-55s", field_name, + uval, str); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { char *comm = elt->private_data; uval = *(u64 *)(key + key_field->offset); - seq_printf(m, "%s: %-16s[%10llu]", - key_field->field->name, comm, uval); + seq_printf(m, "%s: %-16s[%10llu]", field_name, + comm, uval); } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { const char *syscall_name; @@ -991,8 +1013,8 @@ hist_trigger_entry_print(struct seq_file *m, if (!syscall_name) syscall_name = "unknown_syscall"; - seq_printf(m, "%s: %-30s[%3llu]", - key_field->field->name, syscall_name, uval); + seq_printf(m, "%s: %-30s[%3llu]", field_name, + syscall_name, uval); } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { seq_puts(m, "stacktrace:\n"); hist_trigger_stacktrace_print(m, @@ -1000,15 +1022,14 @@ hist_trigger_entry_print(struct seq_file *m, HIST_STACKTRACE_DEPTH); multiline = true; } else if (key_field->flags & HIST_FIELD_FL_LOG2) { - seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name, + seq_printf(m, "%s: ~ 2^%-2llu", field_name, *(u64 *)(key + key_field->offset)); } else if (key_field->flags & HIST_FIELD_FL_STRING) { - seq_printf(m, "%s: %-50s", key_field->field->name, + seq_printf(m, "%s: %-50s", field_name, (char *)(key + key_field->offset)); } else { uval = *(u64 *)(key + key_field->offset); - seq_printf(m, "%s: %10llu", key_field->field->name, - uval); + seq_printf(m, "%s: %10llu", field_name, uval); } } @@ -1021,13 +1042,13 @@ hist_trigger_entry_print(struct seq_file *m, tracing_map_read_sum(elt, HITCOUNT_IDX)); for (i = 1; i < hist_data->n_vals; i++) { + field_name = hist_field_name(hist_data->fields[i], 0); + if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { - seq_printf(m, " %s: %10llx", - hist_data->fields[i]->field->name, + seq_printf(m, " %s: %10llx", field_name, tracing_map_read_sum(elt, i)); } else { - seq_printf(m, " %s: %10llu", - hist_data->fields[i]->field->name, + seq_printf(m, " %s: %10llu", field_name, tracing_map_read_sum(elt, i)); } } @@ -1140,7 +1161,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) { - seq_printf(m, "%s", hist_field->field->name); + const char *field_name = hist_field_name(hist_field, 0); + + seq_printf(m, "%s", field_name); if (hist_field->flags) { const char *flags_str = get_hist_field_flags(hist_field); -- cgit From 5819eaddf35b24d628ddfa4fbb5f8d4026e44b96 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 22 Sep 2017 14:58:23 -0500 Subject: tracing: Reimplement log2 log2 as currently implemented applies only to u64 trace_event_field derived fields, and assumes that anything it's applied to is a u64 field. To prepare for synthetic fields like latencies, log2 should be applicable to those as well, so take the opportunity now to fix the current problems as well as expand to more general uses. log2 should be thought of as a chaining function rather than a field type. To enable this as well as possible future function implementations, add a hist_field operand array into the hist_field definition for this purpose, and make use of it to implement the log2 'function'. Link: http://lkml.kernel.org/r/b47f93fc0b87b36eccf716b0c018f3a71e1f1111.1506105045.git.tom.zanussi@linux.intel.com Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 34edf5fd85fd..1e1558c99d56 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -28,12 +28,16 @@ struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); +#define HIST_FIELD_OPERANDS_MAX 2 + struct hist_field { struct ftrace_event_field *field; unsigned long flags; hist_field_fn_t fn; unsigned int size; unsigned int offset; + unsigned int is_signed; + struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; }; static u64 hist_field_none(struct hist_field *field, void *event) @@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event) static u64 hist_field_log2(struct hist_field *hist_field, void *event) { - u64 val = *(u64 *)(event + hist_field->field->offset); + struct hist_field *operand = hist_field->operands[0]; + + u64 val = operand->fn(operand, event); return (u64) ilog2(roundup_pow_of_two(val)); } @@ -156,6 +162,8 @@ static const char *hist_field_name(struct hist_field *field, if (field->field) field_name = field->field->name; + else if (field->flags & HIST_FIELD_FL_LOG2) + field_name = hist_field_name(field->operands[0], ++level); if (field_name == NULL) field_name = ""; @@ -357,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = { .elt_init = hist_trigger_elt_comm_init, }; -static void destroy_hist_field(struct hist_field *hist_field) +static void destroy_hist_field(struct hist_field *hist_field, + unsigned int level) { + unsigned int i; + + if (level > 2) + return; + + if (!hist_field) + return; + + for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) + destroy_hist_field(hist_field->operands[i], level + 1); + kfree(hist_field); } @@ -385,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, } if (flags & HIST_FIELD_FL_LOG2) { + unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; hist_field->fn = hist_field_log2; + hist_field->operands[0] = create_hist_field(field, fl); + hist_field->size = hist_field->operands[0]->size; goto out; } @@ -405,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field, hist_field->fn = select_value_fn(field->size, field->is_signed); if (!hist_field->fn) { - destroy_hist_field(hist_field); + destroy_hist_field(hist_field, 0); return NULL; } } @@ -422,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data) for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { if (hist_data->fields[i]) { - destroy_hist_field(hist_data->fields[i]); + destroy_hist_field(hist_data->fields[i], 0); hist_data->fields[i] = NULL; } } -- cgit From b35bd0d9f8a8ea17aae40893e18274d191a2d2c5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 30 Sep 2017 23:39:05 -0600 Subject: sysctl: remove /proc/sys/vm/nr_pdflush_threads This tunable has been obsolete since 2.6.32, and writes to the file have been failing and complaining in dmesg since then: nr_pdflush_threads exported in /proc is scheduled for removal That was 8 years ago. Remove the file ABI obsolete notice, and the sysfs file. Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- kernel/sysctl.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbbb8157..a5dd8d82c253 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1344,11 +1344,6 @@ static struct ctl_table vm_table[] = { .proc_handler = dirtytime_interval_handler, .extra1 = &zero, }, - { - .procname = "nr_pdflush_threads", - .mode = 0444 /* read-only */, - .proc_handler = pdflush_proc_obsolete, - }, { .procname = "swappiness", .data = &vm_swappiness, -- cgit From 6cafbe159416822f6d3dfd711bf4c39050c650ba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 20 Jun 2017 10:44:58 -0400 Subject: ftrace: Add a ftrace_free_mem() function for modules to use In order to be able to trace module init functions, the module code needs to tell ftrace what is being freed when the init sections are freed. Use the code that the main init calls to tell ftrace to free the main init sections. This requires passing in a start and end address to free. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6abfafd7f173..84cb5928665a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5868,10 +5868,10 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ -void __init ftrace_free_init_mem(void) +void ftrace_free_mem(void *start_ptr, void *end_ptr) { - unsigned long start = (unsigned long)(&__init_begin); - unsigned long end = (unsigned long)(&__init_end); + unsigned long start = (unsigned long)(start_ptr); + unsigned long end = (unsigned long)(end_ptr); struct ftrace_page **last_pg = &ftrace_pages_start; struct ftrace_page *pg; struct dyn_ftrace *rec; @@ -5913,6 +5913,14 @@ void __init ftrace_free_init_mem(void) mutex_unlock(&ftrace_lock); } +void __init ftrace_free_init_mem(void) +{ + void *start = (void *)(&__init_begin); + void *end = (void *)(&__init_end); + + ftrace_free_mem(start, end); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; -- cgit From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:21 -0700 Subject: bpf: multi program support for cgroup+bpf introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple bpf programs to a cgroup. The difference between three possible flags for BPF_PROG_ATTACH command: - NONE(default): No further bpf programs allowed in the subtree. - BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, the program in this cgroup yields to sub-cgroup program. - BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, that cgroup program gets run in addition to the program in this cgroup. NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't change their behavior. It only clarifies the semantics in relation to new flag. Only one program is allowed to be attached to a cgroup with NONE or BPF_F_ALLOW_OVERRIDE flag. Multiple programs are allowed to be attached to a cgroup with BPF_F_ALLOW_MULTI flag. They are executed in FIFO order (those that were attached first, run first) The programs of sub-cgroup are executed first, then programs of this cgroup and then programs of parent cgroup. All eligible programs are executed regardless of return code from earlier programs. To allow efficient execution of multiple programs attached to a cgroup and to avoid penalizing cgroups without any programs attached introduce 'struct bpf_prog_array' which is RCU protected array of pointers to bpf programs. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 467 +++++++++++++++++++++++++++++++++++-------------- kernel/bpf/core.c | 31 ++++ kernel/bpf/syscall.c | 37 ++-- kernel/cgroup/cgroup.c | 28 ++- 4 files changed, 414 insertions(+), 149 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 546113430049..6b7500bbdb53 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp) { unsigned int type; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) { - struct bpf_prog *prog = cgrp->bpf.prog[type]; - - if (prog) { - bpf_prog_put(prog); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog_list *pl, *tmp; + + list_for_each_entry_safe(pl, tmp, progs, node) { + list_del(&pl->node); + bpf_prog_put(pl->prog); + kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } + bpf_prog_array_free(cgrp->bpf.effective[type]); + } +} + +/* count number of elements in the list. + * it's slow but the list cannot be long + */ +static u32 prog_list_length(struct list_head *head) +{ + struct bpf_prog_list *pl; + u32 cnt = 0; + + list_for_each_entry(pl, head, node) { + if (!pl->prog) + continue; + cnt++; } + return cnt; +} + +/* if parent has non-overridable prog attached, + * disallow attaching new programs to the descendent cgroup. + * if parent has overridable or multi-prog, allow attaching + */ +static bool hierarchy_allows_attach(struct cgroup *cgrp, + enum bpf_attach_type type, + u32 new_flags) +{ + struct cgroup *p; + + p = cgroup_parent(cgrp); + if (!p) + return true; + do { + u32 flags = p->bpf.flags[type]; + u32 cnt; + + if (flags & BPF_F_ALLOW_MULTI) + return true; + cnt = prog_list_length(&p->bpf.progs[type]); + WARN_ON_ONCE(cnt > 1); + if (cnt == 1) + return !!(flags & BPF_F_ALLOW_OVERRIDE); + p = cgroup_parent(p); + } while (p); + return true; +} + +/* compute a chain of effective programs for a given cgroup: + * start from the list of programs in this cgroup and add + * all parent programs. + * Note that parent's F_ALLOW_OVERRIDE-type program is yielding + * to programs in this cgroup + */ +static int compute_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu **array) +{ + struct bpf_prog_array __rcu *progs; + struct bpf_prog_list *pl; + struct cgroup *p = cgrp; + int cnt = 0; + + /* count number of effective programs by walking parents */ + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + cnt += prog_list_length(&p->bpf.progs[type]); + p = cgroup_parent(p); + } while (p); + + progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); + if (!progs) + return -ENOMEM; + + /* populate the array with effective progs */ + cnt = 0; + p = cgrp; + do { + if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) + list_for_each_entry(pl, + &p->bpf.progs[type], node) { + if (!pl->prog) + continue; + rcu_dereference_protected(progs, 1)-> + progs[cnt++] = pl->prog; + } + p = cgroup_parent(p); + } while (p); + + *array = progs; + return 0; +} + +static void activate_effective_progs(struct cgroup *cgrp, + enum bpf_attach_type type, + struct bpf_prog_array __rcu *array) +{ + struct bpf_prog_array __rcu *old_array; + + old_array = xchg(&cgrp->bpf.effective[type], array); + /* free prog array after grace period, since __cgroup_bpf_run_*() + * might be still walking the array + */ + bpf_prog_array_free(old_array); } /** * cgroup_bpf_inherit() - inherit effective programs from parent * @cgrp: the cgroup to modify - * @parent: the parent to inherit from */ -void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent) +int cgroup_bpf_inherit(struct cgroup *cgrp) { - unsigned int type; +/* has to use marco instead of const int, since compiler thinks + * that array below is variable length + */ +#define NR ARRAY_SIZE(cgrp->bpf.effective) + struct bpf_prog_array __rcu *arrays[NR] = {}; + int i; - for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) { - struct bpf_prog *e; + for (i = 0; i < NR; i++) + INIT_LIST_HEAD(&cgrp->bpf.progs[i]); - e = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - rcu_assign_pointer(cgrp->bpf.effective[type], e); - cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type]; - } + for (i = 0; i < NR; i++) + if (compute_effective_progs(cgrp, i, &arrays[i])) + goto cleanup; + + for (i = 0; i < NR; i++) + activate_effective_progs(cgrp, i, arrays[i]); + + return 0; +cleanup: + for (i = 0; i < NR; i++) + bpf_prog_array_free(arrays[i]); + return -ENOMEM; } +#define BPF_CGROUP_MAX_PROGS 64 + /** - * __cgroup_bpf_update() - Update the pinned program of a cgroup, and + * __cgroup_bpf_attach() - Attach the program to a cgroup, and * propagate the change to descendants * @cgrp: The cgroup which descendants to traverse - * @parent: The parent of @cgrp, or %NULL if @cgrp is the root - * @prog: A new program to pin - * @type: Type of pinning operation (ingress/egress) - * - * Each cgroup has a set of two pointers for bpf programs; one for eBPF - * programs it owns, and which is effective for execution. - * - * If @prog is not %NULL, this function attaches a new program to the cgroup - * and releases the one that is currently attached, if any. @prog is then made - * the effective program of type @type in that cgroup. - * - * If @prog is %NULL, the currently attached program of type @type is released, - * and the effective program of the parent cgroup (if any) is inherited to - * @cgrp. - * - * Then, the descendants of @cgrp are walked and the effective program for - * each of them is set to the effective program of @cgrp unless the - * descendant has its own program attached, in which case the subbranch is - * skipped. This ensures that delegated subcgroups with own programs are left - * untouched. + * @prog: A program to attach + * @type: Type of attach operation * * Must be called with cgroup_mutex held. */ -int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, - struct bpf_prog *prog, enum bpf_attach_type type, - bool new_overridable) +int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct bpf_prog *old_prog, *effective = NULL; - struct cgroup_subsys_state *pos; - bool overridable = true; - - if (parent) { - overridable = !parent->bpf.disallow_override[type]; - effective = rcu_dereference_protected(parent->bpf.effective[type], - lockdep_is_held(&cgroup_mutex)); - } - - if (prog && effective && !overridable) - /* if parent has non-overridable prog attached, disallow - * attaching new programs to descendent cgroup - */ + struct list_head *progs = &cgrp->bpf.progs[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + bool pl_was_allocated; + u32 old_flags; + int err; + + if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) + /* invalid combination */ + return -EINVAL; + + if (!hierarchy_allows_attach(cgrp, type, flags)) return -EPERM; - if (prog && effective && overridable != new_overridable) - /* if parent has overridable prog attached, only - * allow overridable programs in descendent cgroup + if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) + /* Disallow attaching non-overridable on top + * of existing overridable in this cgroup. + * Disallow attaching multi-prog if overridable or none */ return -EPERM; - old_prog = cgrp->bpf.prog[type]; - - if (prog) { - overridable = new_overridable; - effective = prog; - if (old_prog && - cgrp->bpf.disallow_override[type] == new_overridable) - /* disallow attaching non-overridable on top - * of existing overridable in this cgroup - * and vice versa - */ - return -EPERM; + if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + return -E2BIG; + + if (flags & BPF_F_ALLOW_MULTI) { + list_for_each_entry(pl, progs, node) + if (pl->prog == prog) + /* disallow attaching the same prog twice */ + return -EINVAL; + + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + pl->prog = prog; + list_add_tail(&pl->node, progs); + } else { + if (list_empty(progs)) { + pl = kmalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return -ENOMEM; + pl_was_allocated = true; + list_add_tail(&pl->node, progs); + } else { + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl_was_allocated = false; + } + pl->prog = prog; } - if (!prog && !old_prog) - /* report error when trying to detach and nothing is attached */ - return -ENOENT; + old_flags = cgrp->bpf.flags[type]; + cgrp->bpf.flags[type] = flags; - cgrp->bpf.prog[type] = prog; + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); - css_for_each_descendant_pre(pos, &cgrp->self) { - struct cgroup *desc = container_of(pos, struct cgroup, self); - - /* skip the subtree if the descendant has its own program */ - if (desc->bpf.prog[type] && desc != cgrp) { - pos = css_rightmost_descendant(pos); - } else { - rcu_assign_pointer(desc->bpf.effective[type], - effective); - desc->bpf.disallow_override[type] = !overridable; - } + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; } - if (prog) - static_branch_inc(&cgroup_bpf_enabled_key); + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + static_branch_inc(&cgroup_bpf_enabled_key); if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key); } return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and cleanup the prog list */ + pl->prog = old_prog; + if (pl_was_allocated) { + list_del(&pl->node); + kfree(pl); + } + return err; +} + +/** + * __cgroup_bpf_detach() - Detach the program from a cgroup, and + * propagate the change to descendants + * @cgrp: The cgroup which descendants to traverse + * @prog: A program to detach or NULL + * @type: Type of detach operation + * + * Must be called with cgroup_mutex held. + */ +int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 unused_flags) +{ + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog *old_prog = NULL; + struct cgroup_subsys_state *css; + struct bpf_prog_list *pl; + int err; + + if (flags & BPF_F_ALLOW_MULTI) { + if (!prog) + /* to detach MULTI prog the user has to specify valid FD + * of the program to be detached + */ + return -EINVAL; + } else { + if (list_empty(progs)) + /* report error when trying to detach and nothing is attached */ + return -ENOENT; + } + + if (flags & BPF_F_ALLOW_MULTI) { + /* find the prog and detach it */ + list_for_each_entry(pl, progs, node) { + if (pl->prog != prog) + continue; + old_prog = prog; + /* mark it deleted, so it's ignored while + * recomputing effective + */ + pl->prog = NULL; + break; + } + if (!old_prog) + return -ENOENT; + } else { + /* to maintain backward compatibility NONE and OVERRIDE cgroups + * allow detaching with invalid FD (prog==NULL) + */ + pl = list_first_entry(progs, typeof(*pl), node); + old_prog = pl->prog; + pl->prog = NULL; + } + + /* allocate and recompute effective prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + err = compute_effective_progs(desc, type, &desc->bpf.inactive); + if (err) + goto cleanup; + } + + /* all allocations were successful. Activate all prog arrays */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + activate_effective_progs(desc, type, desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* now can actually delete it from this cgroup list */ + list_del(&pl->node); + kfree(pl); + if (list_empty(progs)) + /* last program was detached, reset flags to zero */ + cgrp->bpf.flags[type] = 0; + + bpf_prog_put(old_prog); + static_branch_dec(&cgroup_bpf_enabled_key); + return 0; + +cleanup: + /* oom while computing effective. Free all computed effective arrays + * since they were not activated + */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + + /* and restore back old_prog */ + pl->prog = old_prog; + return err; } /** @@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, enum bpf_attach_type type) { - struct bpf_prog *prog; + unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk; struct cgroup *cgrp; - int ret = 0; + int ret; if (!sk || !sk_fullsock(sk)) return 0; - if (sk->sk_family != AF_INET && - sk->sk_family != AF_INET6) + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) { - unsigned int offset = skb->data - skb_network_header(skb); - struct sock *save_sk = skb->sk; - - skb->sk = sk; - __skb_push(skb, offset); - ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM; - __skb_pull(skb, offset); - skb->sk = save_sk; - } - - rcu_read_unlock(); - - return ret; + save_sk = skb->sk; + skb->sk = sk; + __skb_push(skb, offset); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + bpf_prog_run_save_cb); + __skb_pull(skb, offset); + skb->sk = save_sk; + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM; + int ret; - rcu_read_unlock(); - - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum bpf_attach_type type) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - struct bpf_prog *prog; - int ret = 0; - - - rcu_read_lock(); - - prog = rcu_dereference(cgrp->bpf.effective[type]); - if (prog) - ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM; - - rcu_read_unlock(); + int ret; - return ret; + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, + BPF_PROG_RUN); + return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 917cc04a0a94..6b49e1991ae7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +static struct { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +} empty_prog_array = { + .null_prog = NULL, +}; + +struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +{ + if (prog_cnt) + return kzalloc(sizeof(struct bpf_prog_array) + + sizeof(struct bpf_prog *) * (prog_cnt + 1), + flags); + + return &empty_prog_array.hdr; +} + +void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +{ + if (!progs || + progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + return; + kfree_rcu(progs, rcu); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b927da66f653..51bee695d32c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return 0; } +#define BPF_F_ATTACH_MASK \ + (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) + static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; @@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; - if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE) + if (attr->attach_flags & ~BPF_F_ATTACH_MASK) return -EINVAL; switch (attr->attach_type) { @@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) return PTR_ERR(cgrp); } - ret = cgroup_bpf_update(cgrp, prog, attr->attach_type, - attr->attach_flags & BPF_F_ALLOW_OVERRIDE); + ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, + attr->attach_flags); if (ret) bpf_prog_put(prog); cgroup_put(cgrp); @@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) static int bpf_prog_detach(const union bpf_attr *attr) { + enum bpf_prog_type ptype; + struct bpf_prog *prog; struct cgroup *cgrp; int ret; @@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (attr->attach_type) { case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: + ptype = BPF_PROG_TYPE_CGROUP_SKB; + break; case BPF_CGROUP_INET_SOCK_CREATE: + ptype = BPF_PROG_TYPE_CGROUP_SOCK; + break; case BPF_CGROUP_SOCK_OPS: - cgrp = cgroup_get_from_fd(attr->target_fd); - if (IS_ERR(cgrp)) - return PTR_ERR(cgrp); - - ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false); - cgroup_put(cgrp); + ptype = BPF_PROG_TYPE_SOCK_OPS; break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - ret = sockmap_get_from_fd(attr, false); - break; + return sockmap_get_from_fd(attr, false); default: return -EINVAL; } + cgrp = cgroup_get_from_fd(attr->target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + prog = NULL; + + ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); + if (prog) + bpf_prog_put(prog); + cgroup_put(cgrp); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6551cd45238..57eb866ae78d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) if (ret) goto destroy_root; + ret = cgroup_bpf_inherit(root_cgrp); + WARN_ON_ONCE(ret); + trace_cgroup_setup_root(root); /* @@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent) cgrp->self.parent = &parent->self; cgrp->root = root; cgrp->level = level; + ret = cgroup_bpf_inherit(cgrp); + if (ret) + goto out_idr_free; for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; @@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (!cgroup_on_dfl(cgrp)) cgrp->subtree_control = cgroup_control(cgrp); - if (parent) - cgroup_bpf_inherit(cgrp, parent); - cgroup_propagate_control(cgrp); return cgrp; +out_idr_free: + cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd) #endif /* CONFIG_SOCK_CGROUP_DATA */ #ifdef CONFIG_CGROUP_BPF -int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, bool overridable) +int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_attach(cgrp, prog, type, flags); + mutex_unlock(&cgroup_mutex); + return ret; +} +int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, + enum bpf_attach_type type, u32 flags) { - struct cgroup *parent = cgroup_parent(cgrp); int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable); + ret = __cgroup_bpf_detach(cgrp, prog, type, flags); mutex_unlock(&cgroup_mutex); return ret; } -- cgit From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:22 -0700 Subject: bpf: introduce BPF_PROG_QUERY command introduce BPF_PROG_QUERY command to retrieve a set of either attached programs to given cgroup or a set of effective programs that will execute for events within a cgroup Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau for cgroup bits Acked-by: Tejun Heo Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++++ kernel/cgroup/cgroup.c | 10 ++++++++++ 4 files changed, 128 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6b7500bbdb53..e88abc0865d5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -384,6 +384,52 @@ cleanup: return err; } +/* Must be called with cgroup_mutex held to avoid races. */ +int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + enum bpf_attach_type type = attr->query.attach_type; + struct list_head *progs = &cgrp->bpf.progs[type]; + u32 flags = cgrp->bpf.flags[type]; + int cnt, ret = 0, i; + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) + cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + else + cnt = prog_list_length(progs); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) + /* return early if user requested only program count + flags */ + return 0; + if (attr->query.prog_cnt < cnt) { + cnt = attr->query.prog_cnt; + ret = -ENOSPC; + } + + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { + return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], + prog_ids, cnt); + } else { + struct bpf_prog_list *pl; + u32 id; + + i = 0; + list_for_each_entry(pl, progs, node) { + id = pl->prog->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) + return -EFAULT; + if (++i == cnt) + break; + } + } + return ret; +} + /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6b49e1991ae7..eba966c09053 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) kfree_rcu(progs, rcu); } +int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) +{ + struct bpf_prog **prog; + u32 cnt = 0; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) + cnt++; + rcu_read_unlock(); + return cnt; +} + +int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, + __u32 __user *prog_ids, u32 cnt) +{ + struct bpf_prog **prog; + u32 i = 0, id; + + rcu_read_lock(); + prog = rcu_dereference(progs)->progs; + for (; *prog; prog++) { + id = (*prog)->aux->id; + if (copy_to_user(prog_ids + i, &id, sizeof(id))) { + rcu_read_unlock(); + return -EFAULT; + } + if (++i == cnt) { + prog++; + break; + } + } + rcu_read_unlock(); + if (*prog) + return -ENOSPC; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 51bee695d32c..0048cb24ba7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } +#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt + +static int bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct cgroup *cgrp; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (CHECK_ATTR(BPF_PROG_QUERY)) + return -EINVAL; + if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE) + return -EINVAL; + + switch (attr->query.attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_SOCK_OPS: + break; + default: + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->query.target_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + ret = cgroup_bpf_query(cgrp, attr, uattr); + cgroup_put(cgrp); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration @@ -1568,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_PROG_DETACH: err = bpf_prog_detach(&attr); break; + case BPF_PROG_QUERY: + err = bpf_prog_query(&attr, uattr); + break; #endif case BPF_PROG_TEST_RUN: err = bpf_prog_test_run(&attr, uattr); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 57eb866ae78d..269512b94a94 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, mutex_unlock(&cgroup_mutex); return ret; } +int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = __cgroup_bpf_query(cgrp, attr, uattr); + mutex_unlock(&cgroup_mutex); + return ret; +} #endif /* CONFIG_CGROUP_BPF */ -- cgit From 390ee7e29fc8e6e90d3065b00afb047c4ee552f9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 2 Oct 2017 22:50:23 -0700 Subject: bpf: enforce return code for cgroup-bpf programs with addition of tnum logic the verifier got smart enough and we can enforce return codes at program load time. For now do so for cgroup-bpf program types. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4cf9b72c59a0..52b022310f6a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3073,6 +3073,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static int check_return_code(struct bpf_verifier_env *env) +{ + struct bpf_reg_state *reg; + struct tnum range = tnum_range(0, 1); + + switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + break; + default: + return 0; + } + + reg = &env->cur_state.regs[BPF_REG_0]; + if (reg->type != SCALAR_VALUE) { + verbose("At program exit the register R0 is not a known value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + + if (!tnum_in(range, reg->var_off)) { + verbose("At program exit the register R0 "); + if (!tnum_is_unknown(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("has value %s", tn_buf); + } else { + verbose("has unknown scalar value"); + } + verbose(" should have been 0 or 1\n"); + return -EINVAL; + } + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -3863,6 +3900,9 @@ static int do_check(struct bpf_verifier_env *env) return -EACCES; } + err = check_return_code(env); + if (err) + return err; process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { -- cgit From 3e234289f86b12985ef8909cd34525fcb66c4efb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 18:00:22 -0500 Subject: ftrace: Allow module init functions to be traced Allow for module init sections to be traced as well as core kernel init sections. Now that filtering modules functions can be stored, for when they are loaded, it makes sense to be able to trace them. Cc: Jessica Yu Cc: Rusty Russell Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 2 ++ kernel/trace/ftrace.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..58bca427ac3f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3473,6 +3473,8 @@ static noinline int do_init_module(struct module *mod) if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); + ftrace_free_mem(mod->init_layout.base, mod->init_layout.base + + mod->init_layout.size); mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 84cb5928665a..d7297e866e4a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5752,7 +5752,8 @@ void ftrace_release_mod(struct module *mod) last_pg = &ftrace_pages_start; for (pg = ftrace_pages_start; pg; pg = *last_pg) { rec = &pg->records[0]; - if (within_module_core(rec->ip, mod)) { + if (within_module_core(rec->ip, mod) || + within_module_init(rec->ip, mod)) { /* * As core pages are first, the first * page should never be a module page. @@ -5821,7 +5822,8 @@ void ftrace_module_enable(struct module *mod) * not part of this module, then skip this pg, * which the "break" will do. */ - if (!within_module_core(rec->ip, mod)) + if (!within_module_core(rec->ip, mod) && + !within_module_init(rec->ip, mod)) break; cnt = 0; -- cgit From aba4b5c22cbac296f4081a0476d0c55828f135b4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 08:35:38 -0400 Subject: ftrace: Save module init functions kallsyms symbols for tracing If function tracing is active when the module init functions are freed, then store them to be referenced by kallsyms. As module init functions can now be traced on module load, they were useless: ># echo ':mod:snd_seq' > set_ftrace_filter ># echo function > current_tracer ># modprobe snd_seq ># cat trace # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | modprobe-2786 [000] .... 3189.037874: 0xffffffffa0860000 <-do_one_initcall modprobe-2786 [000] .... 3189.037876: 0xffffffffa086004d <-0xffffffffa086000f modprobe-2786 [000] .... 3189.037876: 0xffffffffa086010d <-0xffffffffa0860018 modprobe-2786 [000] .... 3189.037877: 0xffffffffa086011a <-0xffffffffa0860021 modprobe-2786 [000] .... 3189.037877: 0xffffffffa0860080 <-0xffffffffa086002a modprobe-2786 [000] .... 3189.039523: 0xffffffffa0860400 <-0xffffffffa0860033 modprobe-2786 [000] .... 3189.039523: 0xffffffffa086038a <-0xffffffffa086041c modprobe-2786 [000] .... 3189.039591: 0xffffffffa086038a <-0xffffffffa0860436 modprobe-2786 [000] .... 3189.039657: 0xffffffffa086038a <-0xffffffffa0860450 modprobe-2786 [000] .... 3189.039719: 0xffffffffa0860127 <-0xffffffffa086003c modprobe-2786 [000] .... 3189.039742: snd_seq_create_kernel_client <-0xffffffffa08601f6 When the output is shown, the kallsyms for the module init functions have already been freed, and the output of the trace can not convert them to their function names. Now this looks like this: # tracer: function # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | modprobe-2463 [002] .... 174.243237: alsa_seq_init <-do_one_initcall modprobe-2463 [002] .... 174.243239: client_init_data <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_sequencer_memory_init <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_seq_queues_init <-alsa_seq_init modprobe-2463 [002] .... 174.243240: snd_sequencer_device_init <-alsa_seq_init modprobe-2463 [002] .... 174.244860: snd_seq_info_init <-alsa_seq_init modprobe-2463 [002] .... 174.244861: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.244936: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.245003: create_info_entry <-snd_seq_info_init modprobe-2463 [002] .... 174.245072: snd_seq_system_client_init <-alsa_seq_init modprobe-2463 [002] .... 174.245094: snd_seq_create_kernel_client <-snd_seq_system_client_init Signed-off-by: Steven Rostedt (VMware) --- kernel/kallsyms.c | 5 ++ kernel/module.c | 2 +- kernel/trace/ftrace.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 150 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 127e7cfafa55..976ecb9275d9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr, if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); + + if (!ret) + ret = ftrace_mod_address_lookup(addr, symbolsize, + offset, modname, namebuf); return ret; } diff --git a/kernel/module.c b/kernel/module.c index 58bca427ac3f..279a469dc375 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3473,7 +3473,7 @@ static noinline int do_init_module(struct module *mod) if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) async_synchronize_full(); - ftrace_free_mem(mod->init_layout.base, mod->init_layout.base + + ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + mod->init_layout.size); mutex_lock(&module_mutex); /* Drop initial reference. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d7297e866e4a..86dbbfb353db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5675,6 +5675,21 @@ static int ftrace_process_locs(struct module *mod, return ret; } +struct ftrace_mod_func { + struct list_head list; + char *name; + unsigned long ip; + unsigned int size; +}; + +struct ftrace_mod_map { + struct list_head list; + struct module *mod; + unsigned long start_addr; + unsigned long end_addr; + struct list_head funcs; +}; + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -5868,9 +5883,123 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } + +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) +{ + struct ftrace_mod_func *mod_func; + unsigned long symsize; + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str); + if (!ret) + return; + + mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL); + if (!mod_func) + return; + + mod_func->name = kstrdup(str, GFP_KERNEL); + if (!mod_func->name) { + kfree(mod_func); + return; + } + + mod_func->ip = rec->ip - offset; + mod_func->size = symsize; + + list_add_rcu(&mod_func->list, &mod_map->funcs); +} + +static LIST_HEAD(ftrace_mod_maps); + +static struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + struct ftrace_mod_map *mod_map; + + mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL); + if (!mod_map) + return NULL; + + mod_map->mod = mod; + mod_map->start_addr = start; + mod_map->end_addr = end; + + INIT_LIST_HEAD_RCU(&mod_map->funcs); + + list_add_rcu(&mod_map->list, &ftrace_mod_maps); + + return mod_map; +} + +static const char * +ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, + unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) +{ + struct ftrace_mod_func *found_func = NULL; + struct ftrace_mod_func *mod_func; + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if (addr >= mod_func->ip && + addr < mod_func->ip + mod_func->size) { + found_func = mod_func; + break; + } + } + + if (found_func) { + if (size) + *size = found_func->size; + if (off) + *off = addr - found_func->ip; + if (sym) + strlcpy(sym, found_func->name, KSYM_NAME_LEN); + + return found_func->name; + } + + return NULL; +} + +const char * +ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char **modname, char *sym) +{ + struct ftrace_mod_map *mod_map; + const char *ret = NULL; + + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); + if (ret) { + if (modname) + *modname = mod_map->mod->name; + break; + } + } + preempt_enable(); + + return ret; +} + +#else +static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, + struct dyn_ftrace *rec) { } +static inline struct ftrace_mod_map * +allocate_ftrace_mod_map(struct module *mod, + unsigned long start, unsigned long end) +{ + return NULL; +} #endif /* CONFIG_MODULES */ -void ftrace_free_mem(void *start_ptr, void *end_ptr) +void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) { unsigned long start = (unsigned long)(start_ptr); unsigned long end = (unsigned long)(end_ptr); @@ -5878,6 +6007,7 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) struct ftrace_page *pg; struct dyn_ftrace *rec; struct dyn_ftrace key; + struct ftrace_mod_map *mod_map = NULL; int order; key.ip = start; @@ -5885,6 +6015,14 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) mutex_lock(&ftrace_lock); + /* + * If we are freeing module init memory, then check if + * any tracer is active. If so, we need to save a mapping of + * the module functions being freed with the address. + */ + if (mod && ftrace_ops_list != &ftrace_list_end) + mod_map = allocate_ftrace_mod_map(mod, start, end); + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { if (end < pg->records[0].ip || start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) @@ -5895,6 +6033,10 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr) ftrace_cmp_recs); if (!rec) continue; + + if (mod_map) + save_ftrace_mod_rec(mod_map, rec); + pg->index--; ftrace_update_tot_cnt--; if (!pg->index) { @@ -5920,7 +6062,7 @@ void __init ftrace_free_init_mem(void) void *start = (void *)(&__init_begin); void *end = (void *)(&__init_end); - ftrace_free_mem(start, end); + ftrace_free_mem(NULL, start, end); } void __init ftrace_init(void) -- cgit From 6aa69784b43eb5f69120339938c50a97a433049f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 5 Sep 2017 19:20:16 -0400 Subject: ftrace: Add freeing algorithm to free ftrace_mod_maps The ftrace_mod_map is a descriptor to save module init function names in case they were traced, and the trace output needs to reference the function name from the function address. But after the function is unloaded, it the maps should be freed, as the rest of the function names are as well. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 86dbbfb353db..a5824408bed9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5683,6 +5683,7 @@ struct ftrace_mod_func { }; struct ftrace_mod_map { + struct rcu_head rcu; struct list_head list; struct module *mod; unsigned long start_addr; @@ -5694,6 +5695,8 @@ struct ftrace_mod_map { #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +static LIST_HEAD(ftrace_mod_maps); + static int referenced_filters(struct dyn_ftrace *rec) { struct ftrace_ops *ops; @@ -5747,8 +5750,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg) mutex_unlock(&trace_types_lock); } +static void ftrace_free_mod_map(struct rcu_head *rcu) +{ + struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu); + struct ftrace_mod_func *mod_func; + struct ftrace_mod_func *n; + + /* All the contents of mod_map are now not visible to readers */ + list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) { + kfree(mod_func->name); + list_del(&mod_func->list); + kfree(mod_func); + } + + kfree(mod_map); +} + void ftrace_release_mod(struct module *mod) { + struct ftrace_mod_map *mod_map; + struct ftrace_mod_map *n; struct dyn_ftrace *rec; struct ftrace_page **last_pg; struct ftrace_page *tmp_page = NULL; @@ -5760,6 +5781,14 @@ void ftrace_release_mod(struct module *mod) if (ftrace_disabled) goto out_unlock; + list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) { + if (mod_map->mod == mod) { + list_del_rcu(&mod_map->list); + call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map); + break; + } + } + /* * Each module has its own ftrace_pages, remove * them from the list. @@ -5914,8 +5943,6 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, list_add_rcu(&mod_func->list, &mod_map->funcs); } -static LIST_HEAD(ftrace_mod_maps); - static struct ftrace_mod_map * allocate_ftrace_mod_map(struct module *mod, unsigned long start, unsigned long end) @@ -5974,6 +6001,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, struct ftrace_mod_map *mod_map; const char *ret = NULL; + /* mod_map is freed via call_rcu_sched() */ preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym); -- cgit From 6171a0310a06a7a0cb83713fa7068bdd4192de19 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 6 Sep 2017 08:40:41 -0400 Subject: ftrace/kallsyms: Have /proc/kallsyms show saved mod init functions If a module is loaded while tracing is enabled, then there's a possibility that the module init functions were traced. These functions have their name and address stored by ftrace such that it can translate the function address that is written into the buffer into a human readable function name. As userspace tools may be doing the same, they need a way to map function names to their address as well. This is done through reading /proc/kallsyms. Signed-off-by: Steven Rostedt (VMware) --- kernel/kallsyms.c | 38 ++++++++++++++++++++++++++++++++------ kernel/trace/ftrace.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 976ecb9275d9..1966fe1c2b57 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -479,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol); struct kallsym_iter { loff_t pos; loff_t pos_mod_end; + loff_t pos_ftrace_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -501,11 +502,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) +{ + int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, + &iter->value, &iter->type, + iter->name, iter->module_name, + &iter->exported); + if (ret < 0) { + iter->pos_ftrace_mod_end = iter->pos; + return 0; + } + + return 1; +} + static int get_ksymbol_bpf(struct kallsym_iter *iter) { iter->module_name[0] = '\0'; iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_mod_end, + return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, iter->name) < 0 ? 0 : 1; } @@ -530,20 +545,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; - if (new_pos == 0) + if (new_pos == 0) { iter->pos_mod_end = 0; + iter->pos_ftrace_mod_end = 0; + } } static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; - if (iter->pos_mod_end > 0 && - iter->pos_mod_end < iter->pos) + if (iter->pos_ftrace_mod_end > 0 && + iter->pos_ftrace_mod_end < iter->pos) return get_ksymbol_bpf(iter); - if (!get_ksymbol_mod(iter)) - return get_ksymbol_bpf(iter); + if (iter->pos_mod_end > 0 && + iter->pos_mod_end < iter->pos) { + if (!get_ksymbol_ftrace_mod(iter)) + return get_ksymbol_bpf(iter); + return 1; + } + + if (!get_ksymbol_mod(iter)) { + if (!get_ksymbol_ftrace_mod(iter)) + return get_ksymbol_bpf(iter); + } return 1; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a5824408bed9..9e99bd55732e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5689,6 +5689,7 @@ struct ftrace_mod_map { unsigned long start_addr; unsigned long end_addr; struct list_head funcs; + unsigned int num_funcs; }; #ifdef CONFIG_MODULES @@ -5940,6 +5941,8 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, mod_func->ip = rec->ip - offset; mod_func->size = symsize; + mod_map->num_funcs++; + list_add_rcu(&mod_func->list, &mod_map->funcs); } @@ -5956,6 +5959,7 @@ allocate_ftrace_mod_map(struct module *mod, mod_map->mod = mod; mod_map->start_addr = start; mod_map->end_addr = end; + mod_map->num_funcs = 0; INIT_LIST_HEAD_RCU(&mod_map->funcs); @@ -6016,6 +6020,42 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, return ret; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + struct ftrace_mod_map *mod_map; + struct ftrace_mod_func *mod_func; + + preempt_disable(); + list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { + + if (symnum >= mod_map->num_funcs) { + symnum -= mod_map->num_funcs; + continue; + } + + list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) { + if (symnum > 1) { + symnum--; + continue; + } + + *value = mod_func->ip; + *type = 'T'; + strlcpy(name, mod_func->name, KSYM_NAME_LEN); + strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN); + *exported = 1; + preempt_enable(); + return 0; + } + WARN_ON(1); + break; + } + preempt_enable(); + return -ERANGE; +} + #else static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map, struct dyn_ftrace *rec) { } -- cgit From aaecaa0b5f31794f1711247da4b5883a6ff98163 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:31 -0700 Subject: tracing: Prepare to add preempt and irq trace events In preparation of adding irqsoff and preemptsoff enable and disable trace events, move required functions and code to make it easier to add these events in a later patch. This patch is just code movement and no functional change. Link: http://lkml.kernel.org/r/20171006005432.14244-2-joelaf@google.com Cc: Peter Zijlstra Cc: kernel-team@android.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 7758bc0617cb..0e3033c00474 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,7 @@ #include "trace.h" +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -462,64 +463,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} - /* * We are only interested in hardirq on/off events: */ -void trace_hardirqs_on(void) +static inline void tracer_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +static inline void tracer_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_off); -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -void trace_preempt_on(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } -void trace_preempt_off(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); @@ -781,3 +762,70 @@ __init static int init_irqsoff_tracer(void) return 0; } core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ + +#ifndef CONFIG_IRQSOFF_TRACER +static inline void tracer_hardirqs_on(void) { } +static inline void tracer_hardirqs_off(void) { } +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#endif + +#ifndef CONFIG_PREEMPT_TRACER +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +void trace_hardirqs_on(void) +{ + tracer_hardirqs_on(); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + tracer_hardirqs_off(); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + tracer_hardirqs_on_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + tracer_hardirqs_off_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +/* + * Stubs: + */ + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} +#endif + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + tracer_preempt_on(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + tracer_preempt_off(a0, a1); +} +#endif -- cgit From 1bd845bcb41d5b7f83745e0cb99273eb376f2ec5 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:09 +0200 Subject: padata: set cpu_index of unused CPUs to -1 The parallel queue per-cpu data structure gets initialized only for CPUs in the 'pcpu' CPU mask set. This is not sufficient as the reorder timer may run on a different CPU and might wrongly decide it's the target CPU for the next reorder item as per-cpu memory gets memset(0) and we might be waiting for the first CPU in cpumask.pcpu, i.e. cpu_index 0. Make the '__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index' compare in padata_get_next() fail in this case by initializing the cpu_index member of all per-cpu parallel queues. Use -1 for unused ones. Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- kernel/padata.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 868f947166d7..1b9b4bac4a9b 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -384,8 +384,14 @@ static void padata_init_pqueues(struct parallel_data *pd) struct padata_parallel_queue *pqueue; cpu_index = 0; - for_each_cpu(cpu, pd->cpumask.pcpu) { + for_each_possible_cpu(cpu) { pqueue = per_cpu_ptr(pd->pqueue, cpu); + + if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) { + pqueue->cpu_index = -1; + continue; + } + pqueue->pd = pd; pqueue->cpu_index = cpu_index; cpu_index++; -- cgit From cf5868c8a22dc2854b96e9569064bb92365549ca Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:10 +0200 Subject: padata: ensure the reorder timer callback runs on the correct CPU The reorder timer function runs on the CPU where the timer interrupt was handled which is not necessarily one of the CPUs of the 'pcpu' CPU mask set. Ensure the padata_reorder() callback runs on the correct CPU, which is one in the 'pcpu' CPU mask set and, preferrably, the next expected one. Do so by comparing the current CPU with the expected target CPU. If they match, call padata_reorder() right away. If they differ, schedule a work item on the target CPU that does the padata_reorder() call for us. Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- kernel/padata.c | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 1b9b4bac4a9b..b4066147bce4 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -275,11 +275,51 @@ static void padata_reorder(struct parallel_data *pd) return; } +static void invoke_padata_reorder(struct work_struct *work) +{ + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + + local_bh_disable(); + pqueue = container_of(work, struct padata_parallel_queue, reorder_work); + pd = pqueue->pd; + padata_reorder(pd); + local_bh_enable(); +} + static void padata_reorder_timer(unsigned long arg) { struct parallel_data *pd = (struct parallel_data *)arg; + unsigned int weight; + int target_cpu, cpu; - padata_reorder(pd); + cpu = get_cpu(); + + /* We don't lock pd here to not interfere with parallel processing + * padata_reorder() calls on other CPUs. We just need any CPU out of + * the cpumask.pcpu set. It would be nice if it's the right one but + * it doesn't matter if we're off to the next one by using an outdated + * pd->processed value. + */ + weight = cpumask_weight(pd->cpumask.pcpu); + target_cpu = padata_index_to_cpu(pd, pd->processed % weight); + + /* ensure to call the reorder callback on the correct CPU */ + if (cpu != target_cpu) { + struct padata_parallel_queue *pqueue; + struct padata_instance *pinst; + + /* The timer function is serialized wrt itself -- no locking + * needed. + */ + pinst = pd->pinst; + pqueue = per_cpu_ptr(pd->pqueue, target_cpu); + queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work); + } else { + padata_reorder(pd); + } + + put_cpu(); } static void padata_serial_worker(struct work_struct *serial_work) @@ -399,6 +439,7 @@ static void padata_init_pqueues(struct parallel_data *pd) __padata_list_init(&pqueue->reorder); __padata_list_init(&pqueue->parallel); INIT_WORK(&pqueue->work, padata_parallel_worker); + INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder); atomic_set(&pqueue->num_obj, 0); } } -- cgit From 350ef88e7e922354f82a931897ad4a4ce6c686ff Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 8 Sep 2017 20:57:11 +0200 Subject: padata: ensure padata_do_serial() runs on the correct CPU If the algorithm we're parallelizing is asynchronous we might change CPUs between padata_do_parallel() and padata_do_serial(). However, we don't expect this to happen as we need to enqueue the padata object into the per-cpu reorder queue we took it from, i.e. the same-cpu's parallel queue. Ensure we're not switching CPUs for a given padata object by tracking the CPU within the padata object. If the serial callback gets called on the wrong CPU, defer invoking padata_reorder() via a kernel worker on the CPU we're expected to run on. Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- kernel/padata.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b4066147bce4..f262c9a4e70a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->cb_cpu = cb_cpu; target_cpu = padata_cpu_hash(pd); + padata->cpu = target_cpu; queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -363,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata) int cpu; struct padata_parallel_queue *pqueue; struct parallel_data *pd; + int reorder_via_wq = 0; pd = padata->pd; cpu = get_cpu(); + + /* We need to run on the same CPU padata_do_parallel(.., padata, ..) + * was called on -- or, at least, enqueue the padata object into the + * correct per-cpu queue. + */ + if (cpu != padata->cpu) { + reorder_via_wq = 1; + cpu = padata->cpu; + } + pqueue = per_cpu_ptr(pd->pqueue, cpu); spin_lock(&pqueue->reorder.lock); @@ -376,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata) put_cpu(); - padata_reorder(pd); + /* If we're running on the wrong CPU, call padata_reorder() via a + * kernel worker. + */ + if (reorder_via_wq) + queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work); + else + padata_reorder(pd); } EXPORT_SYMBOL(padata_do_serial); -- cgit From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:19 -0700 Subject: bpf: perf event change needed for subsequent bpf helpers This patch does not impact existing functionalities. It contains the changes in perf event area needed for subsequent bpf_perf_event_read_value and bpf_perf_prog_read_value helpers. Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 2 +- kernel/events/core.c | 15 +++++++++++++-- kernel/trace/bpf_trace.c | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..68d866628be0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..902149f05381 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..95888ae6c263 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + err = perf_event_read_local(ee->event, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi -- cgit From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:20 -0700 Subject: bpf: add helper bpf_perf_event_read_value for perf event array map Hardware pmu counters are limited resources. When there are more pmu based perf events opened than available counters, kernel will multiplex these events so each event gets certain percentage (but not 100%) of the pmu time. In case that multiplexing happens, the number of samples or counter value will not reflect the case compared to no multiplexing. This makes comparison between different runs difficult. Typically, the number of samples or counter value should be normalized before comparing to other experiments. The typical normalization is done like: normalized_num_samples = num_samples * time_enabled / time_running normalized_counter_value = counter_value * time_enabled / time_running where time_enabled is the time enabled for event and time_running is the time running for event since last normalization. This patch adds helper bpf_perf_event_read_value for kprobed based perf event array map, to read perf counter and enabled/running time. The enabled/running time is accumulated since the perf event open. To achieve scaling factor between two bpf invocations, users can can use cpu_id as the key (which is typical for perf array usage model) to remember the previous value and do the calculation inside the bpf program. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 4 +++- kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52b022310f6a..590125e29161 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1552,7 +1552,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && - func_id != BPF_FUNC_perf_event_output) + func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_perf_event_read_value) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -1595,6 +1596,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: + case BPF_FUNC_perf_event_read_value: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 95888ae6c263..0be86cc0130e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - u64 value = 0; - int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value, NULL, NULL); + return perf_event_read_local(ee->event, value, enabled, running); +} + +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi @@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); static __always_inline u64 @@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; default: return tracing_func_proto(func_id); } -- cgit From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 5 Oct 2017 09:19:22 -0700 Subject: bpf: add helper bpf_perf_prog_read_value This patch adds helper bpf_perf_prog_read_cvalue for perf event based bpf programs, to read event counter and enabled/running time. The enabled/running time is accumulated since the perf event open. The typical use case for perf event based bpf program is to attach itself to a single event. In such cases, if it is desirable to get scaling factor between two bpf invocations, users can can save the time values in a map, and use the value from the map and the current value to calculate the scaling factor. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0be86cc0130e..04ea5314f2bc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -613,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { + .func = bpf_perf_prog_read_value_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -620,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } -- cgit From 473d97343f94ff20f5196078314e4dd83156d3a2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:11 -0700 Subject: bpf: Change bpf_obj_name_cpy() to better ensure map's name is init by 0 During get_info_by_fd, the prog/map name is memcpy-ed. It depends on the prog->aux->name and map->name to be zero initialized. bpf_prog_aux is easy to guarantee that aux->name is zero init. The name in bpf_map may be harder to be guaranteed in the future when new map type is added. Hence, this patch makes bpf_obj_name_cpy() to always zero init the prog/map name. Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0048cb24ba7b..d124e702e040 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -322,6 +322,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src) { const char *end = src + BPF_OBJ_NAME_LEN; + memset(dst, 0, BPF_OBJ_NAME_LEN); + /* Copy all isalnum() and '_' char */ while (src < end && *src) { if (!isalnum(*src) && *src != '_') @@ -333,9 +335,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src) if (src == end) return -EINVAL; - /* '\0' terminates dst */ - *dst = 0; - return 0; } -- cgit From 368211fb920a0b789c238942c6af0414539f79d6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 5 Oct 2017 21:52:13 -0700 Subject: bpf: Append prog->aux->name in bpf_get_prog_name() This patch makes the bpf_prog's name available in kallsyms. The new format is bpf_prog_tag[_name]. Sample kallsyms from running selftests/bpf/test_progs: [root@arch-fb-vm1 ~]# egrep ' bpf_prog_[0-9a-fA-F]{16}' /proc/kallsyms ffffffffa0048000 t bpf_prog_dabf0207d1992486_test_obj_id ffffffffa0038000 t bpf_prog_a04f5eef06a7f555__123456789ABCDE ffffffffa0050000 t bpf_prog_a04f5eef06a7f555 Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c6be15ae83ee..248961af2421 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { + const char *end = sym + KSYM_NAME_LEN; + BUILD_BUG_ON(sizeof("bpf_prog_") + - sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + sizeof(prog->tag) * 2 + + /* name has been null terminated. + * We should need +1 for the '_' preceding + * the name. However, the null character + * is double counted between the name and the + * sizeof("bpf_prog_") above, so we omit + * the +1 here. + */ + sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); - *sym = 0; + if (prog->aux->name[0]) + snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); + else + *sym = 0; } static __always_inline unsigned long -- cgit From de8cd83e91bc3ee212b3e6ec6e4283af9e4ab269 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Mon, 2 Oct 2017 20:21:39 -0400 Subject: audit: Record fanotify access control decisions The fanotify interface allows user space daemons to make access control decisions. Under common criteria requirements, we need to optionally record decisions based on policy. This patch adds a bit mask, FAN_AUDIT, that a user space daemon can 'or' into the response decision which will tell the kernel that it made a decision and record it. It would be used something like this in user space code: response.response = FAN_DENY | FAN_AUDIT; write(fd, &response, sizeof(struct fanotify_response)); When the syscall ends, the audit system will record the decision as a AUDIT_FANOTIFY auxiliary record to denote that the reason this event occurred is the result of an access control decision from fanotify rather than DAC or MAC policy. A sample event looks like this: type=PATH msg=audit(1504310584.332:290): item=0 name="./evil-ls" inode=1319561 dev=fc:03 mode=0100755 ouid=1000 ogid=1000 rdev=00:00 obj=unconfined_u:object_r:user_home_t:s0 nametype=NORMAL type=CWD msg=audit(1504310584.332:290): cwd="/home/sgrubb" type=SYSCALL msg=audit(1504310584.332:290): arch=c000003e syscall=2 success=no exit=-1 a0=32cb3fca90 a1=0 a2=43 a3=8 items=1 ppid=901 pid=959 auid=1000 uid=1000 gid=1000 euid=1000 suid=1000 fsuid=1000 egid=1000 sgid=1000 fsgid=1000 tty=pts1 ses=3 comm="bash" exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t: s0-s0:c0.c1023 key=(null) type=FANOTIFY msg=audit(1504310584.332:290): resp=2 Prior to using the audit flag, the developer needs to call fanotify_init or'ing in FAN_ENABLE_AUDIT to ensure that the kernel supports auditing. The calling process must also have the CAP_AUDIT_WRITE capability. Signed-off-by: sgrubb Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- kernel/auditsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ecc23e25c9eb..9c723e978245 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2390,6 +2390,12 @@ void __audit_log_kern_module(char *name) context->type = AUDIT_KERN_MODULE; } +void __audit_fanotify(unsigned int response) +{ + audit_log(current->audit_context, GFP_KERNEL, + AUDIT_FANOTIFY, "resp=%u", response); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; -- cgit From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:10 -0700 Subject: bpf: encapsulate verifier log state into a structure Put the loose log_* variables into a structure. This will make it simpler to remove the global verifier state in following patches. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6352a88ca6d1..e53458b02249 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,8 +156,7 @@ struct bpf_call_arg_meta { /* verbose verifier prints what it's seeing * bpf_check() is called under lock, so no race to access these global vars */ -static u32 log_level, log_size, log_len; -static char *log_buf; +static struct bpf_verifer_log verifier_log; static DEFINE_MUTEX(bpf_verifier_lock); @@ -167,13 +166,15 @@ static DEFINE_MUTEX(bpf_verifier_lock); */ static __printf(1, 2) void verbose(const char *fmt, ...) { + struct bpf_verifer_log *log = &verifier_log; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + log->len_used += vscnprintf(log->kbuf + log->len_used, + log->len_total - log->len_used, fmt, args); va_end(args); } @@ -886,7 +887,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (log_level) + if (verifier_log.level) print_verifier_state(state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a @@ -2956,7 +2957,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose("R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; } - if (log_level) + if (verifier_log.level) print_verifier_state(this_branch); return 0; } @@ -3712,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (verifier_log.level) { if (do_print_state) verbose("\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); @@ -3725,8 +3726,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) + if (verifier_log.level > 1 || + (verifier_log.level && do_print_state)) { + if (verifier_log.level > 1) verbose("%d:", insn_idx); else verbose("\nfrom %d to %d:", @@ -3735,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (log_level) { + if (verifier_log.level) { verbose("%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -4389,7 +4391,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; + struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; int ret = -EINVAL; @@ -4414,23 +4416,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; + log->len_used = 0; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) + log->kbuf = vmalloc(log->len_total); + if (!log->kbuf) goto err_unlock; } else { - log_level = 0; + log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4467,15 +4469,16 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= log_size); + if (log->level && bpf_verifier_log_full(log)) { + BUG_ON(log->len_used >= log->len_total); /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; /* fall through to return what was recorded */ } /* copy verifier log back to user space including trailing zero */ - if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && copy_to_user(log->ubuf, log->kbuf, + log->len_used + 1) != 0) { ret = -EFAULT; goto free_log_buf; } @@ -4502,8 +4505,8 @@ skip_full_check: } free_log_buf: - if (log_level) - vfree(log_buf); + if (log->level) + vfree(log->kbuf); if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. @@ -4540,7 +4543,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - log_level = 0; + verifier_log.level = 0; env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) -- cgit From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:11 -0700 Subject: bpf: move global verifier log into verifier environment The biggest piece of global state protected by the verifier lock is the verifier_log. Move that log to struct bpf_verifier_env. struct bpf_verifier_env has to be passed now to all invocations of verbose(). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 491 ++++++++++++++++++++++++++------------------------ 1 file changed, 259 insertions(+), 232 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e53458b02249..a352f93cd4b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,20 +153,16 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static struct bpf_verifer_log verifier_log; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) { - struct bpf_verifer_log *log = &verifier_log; + struct bpf_verifer_log *log = &env->log; va_list args; if (!log->level || bpf_verifier_log_full(log)) @@ -214,7 +210,8 @@ static const char *func_id_name(int id) return "unknown"; } -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -225,21 +222,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); + verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(",r=%d", reg->range); + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -247,38 +244,38 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, + verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } - verbose("\n"); + verbose(env, "\n"); } static const char *const bpf_class_string[] = { @@ -333,15 +330,15 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(const struct bpf_verifier_env *env, +static void print_bpf_end_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { - verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", insn->imm, insn->dst_reg); } -static void print_bpf_insn(const struct bpf_verifier_env *env, +static void print_bpf_insn(struct bpf_verifier_env *env, const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -349,23 +346,23 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose("BUG_alu64_%02x\n", insn->code); + verbose(env, "BUG_alu64_%02x\n", insn->code); else print_bpf_end_insn(env, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose("(%02x) r%d = %s-r%d\n", + verbose(env, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) %sr%d %s %sr%d\n", + verbose(env, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose("(%02x) %sr%d %s %s%d\n", + verbose(env, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -374,46 +371,46 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose("BUG_%02x\n", insn->code); + verbose(env, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); + verbose(env, "BUG_st_%02x\n", insn->code); return; } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); + verbose(env, "BUG_ldx_%02x\n", insn->code); return; } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -428,36 +425,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, if (map_ptr && !env->allow_ptr_leaks) imm = 0; - verbose("(%02x) r%d = 0x%llx\n", insn->code, + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, insn->dst_reg, (unsigned long long)imm); } else { - verbose("BUG_ld_%02x\n", insn->code); + verbose(env, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", insn->code, + verbose(env, "(%02x) call %s#%d\n", insn->code, func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", + verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); + verbose(env, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, insn->off); } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); } } @@ -496,7 +494,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; @@ -534,10 +532,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -647,10 +646,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -665,10 +665,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -677,22 +678,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -726,26 +728,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *regs = env->cur_state.regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -770,7 +772,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; @@ -783,7 +786,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -818,7 +821,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -828,12 +832,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, if (slot_type[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (slot_type[i] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + verbose(env, "corrupted spill memory\n"); return -EACCES; } } @@ -849,14 +853,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } else { for (i = 0; i < size; i++) { if (slot_type[i] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -868,7 +872,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = env->cur_state.regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -887,8 +891,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (verifier_log.level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -896,13 +900,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * will have a set floor within our range. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -911,13 +916,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -956,7 +962,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -979,13 +985,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; @@ -1021,7 +1027,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -1039,7 +1045,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1064,7 +1071,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1072,7 +1080,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1087,7 +1096,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1108,7 +1117,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way. */ - return check_pkt_ptr_alignment(reg, off, size, strict); + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1121,7 +1130,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1153,20 +1163,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } /* ctx accesses must be at a fixed offset, so that we can @@ -1176,7 +1186,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", + verbose(env, + "variable ctx access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } @@ -1188,9 +1199,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); + mark_reg_known_zero(env, state->regs, + value_regno); state->regs[value_regno].id = 0; state->regs[value_regno].off = 0; state->regs[value_regno].range = 0; @@ -1206,13 +1218,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, + size); return -EACCES; } @@ -1223,29 +1236,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); + verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - err = check_stack_write(state, off, size, value_regno); + err = check_stack_write(env, state, off, size, + value_regno); } else { - err = check_stack_read(state, off, size, value_regno); + err = check_stack_read(env, state, off, size, + value_regno); } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } @@ -1265,7 +1281,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1280,7 +1296,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } @@ -1321,7 +1337,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1332,13 +1348,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1354,7 +1370,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1397,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; @@ -1405,7 +1422,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } @@ -1443,7 +1460,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1461,7 +1478,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1477,7 +1494,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1497,7 +1514,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1514,7 +1532,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -1528,7 +1546,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1539,12 +1557,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1632,7 +1651,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1666,7 +1685,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(regs, i); + mark_reg_unknown(env, regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) @@ -1688,7 +1707,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } @@ -1696,13 +1716,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1716,7 +1737,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1749,14 +1770,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1764,14 +1785,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1782,12 +1804,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -1846,39 +1868,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad sbounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad ubounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad ubounds\n"); return -EINVAL; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (!env->allow_ptr_leaks) - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -1943,7 +1968,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -1953,7 +1978,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, */ if (ptr_reg->type == PTR_TO_STACK) { if (!env->allow_ptr_leaks) - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -2008,13 +2033,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * ptr &= ~3 which would reduce min_value by 3.) */ if (!env->allow_ptr_leaks) - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2180,7 +2205,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2208,7 +2233,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2236,7 +2261,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2268,12 +2293,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. */ if (!env->allow_ptr_leaks) { - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } else { /* scalar += pointer @@ -2325,13 +2350,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2349,14 +2374,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2367,7 +2392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2381,7 +2406,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2391,7 +2416,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2411,11 +2436,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, + "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); @@ -2430,14 +2456,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2446,7 +2472,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2458,7 +2484,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } @@ -2467,7 +2493,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -2820,13 +2846,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -2836,13 +2862,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -2954,11 +2980,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_good_pkt_pointers(this_branch, ®s[insn->src_reg], PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (verifier_log.level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch); return 0; } @@ -2977,11 +3004,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -3039,14 +3066,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -3056,7 +3083,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[BPF_REG_6].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3069,7 +3097,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -3077,7 +3105,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); return 0; } @@ -3097,22 +3125,22 @@ static int check_return_code(struct bpf_verifier_env *env) reg = &env->cur_state.regs[BPF_REG_0]; if (reg->type != SCALAR_VALUE) { - verbose("At program exit the register R0 is not a known value (%s)\n", + verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); return -EINVAL; } if (!tnum_in(range, reg->var_off)) { - verbose("At program exit the register R0 "); + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("has value %s", tn_buf); + verbose(env, "has value %s", tn_buf); } else { - verbose("has unknown scalar value"); + verbose(env, "has unknown scalar value"); } - verbose(" should have been 0 or 1\n"); + verbose(env, " should have been 0 or 1\n"); return -EINVAL; } return 0; @@ -3178,7 +3206,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3195,13 +3223,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, "insn state internal bug\n"); return -EFAULT; } return 0; @@ -3295,7 +3323,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3304,7 +3332,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -3685,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + init_reg_state(env, regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3694,7 +3722,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } @@ -3703,7 +3731,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -3713,12 +3742,12 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (verifier_log.level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", + verbose(env, "\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else - verbose("%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", insn_idx); } goto process_bpf_exit; } @@ -3726,19 +3755,18 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (verifier_log.level > 1 || - (verifier_log.level && do_print_state)) { - if (verifier_log.level > 1) - verbose("%d:", insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", insn_idx); else - verbose("\nfrom %d to %d:", + verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + print_verifier_state(env, &env->cur_state); do_print_state = false; } - if (verifier_log.level) { - verbose("%d: ", insn_idx); + if (env->log.level) { + verbose(env, "%d: ", insn_idx); print_bpf_insn(env, insn); } @@ -3795,7 +3823,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. */ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -3835,14 +3863,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -3865,7 +3893,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -3878,7 +3906,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -3890,7 +3918,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -3905,7 +3933,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } @@ -3940,19 +3968,19 @@ process_bpf_exit: insn_idx++; } else { - verbose("invalid BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -3964,7 +3992,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -3975,12 +4004,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -4003,14 +4032,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } @@ -4021,7 +4050,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -4030,19 +4059,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4164,7 +4194,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4212,7 +4242,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -4231,7 +4261,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4313,7 +4343,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4357,7 +4387,8 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -4391,8 +4422,8 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - struct bpf_verifer_log *log = &verifier_log; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, @@ -4401,6 +4432,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -4419,7 +4451,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->level = attr->log_level; log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - log->len_used = 0; ret = -EINVAL; /* log attributes have to be sane */ @@ -4431,8 +4462,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) log->kbuf = vmalloc(log->len_total); if (!log->kbuf) goto err_unlock; - } else { - log->level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4543,8 +4572,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - verifier_log.level = 0; - env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; -- cgit From f4ac7e0b5cc8d16004ac57ff679266d573f30f77 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:12 -0700 Subject: bpf: move instruction printing into a separate file Separate the instruction printing into a standalone source file. This way sneaky code from tools/ can compile it in directly. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 1 + kernel/bpf/disasm.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/disasm.h | 32 ++++++++ kernel/bpf/verifier.c | 202 +---------------------------------------------- 4 files changed, 251 insertions(+), 198 deletions(-) create mode 100644 kernel/bpf/disasm.c create mode 100644 kernel/bpf/disasm.h (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 897daa005b23..53fb09f92e3f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,6 +2,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o ifeq ($(CONFIG_STREAM_PARSER),y) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c new file mode 100644 index 000000000000..e682850c9715 --- /dev/null +++ b/kernel/bpf/disasm.c @@ -0,0 +1,214 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +const char *const bpf_class_string[8] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_cb verbose, + struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose(env, "BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(verbose, env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose(env, "(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + } else { + verbose(env, "(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose(env, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_st_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_ldx_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !allow_ptr_leaks) + imm = 0; + + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); + } else { + verbose(env, "BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose(env, "(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose(env, "(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); + } +} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h new file mode 100644 index 000000000000..8de977e420b6 --- /dev/null +++ b/kernel/bpf/disasm.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef __BPF_DISASM_H__ +#define __BPF_DISASM_H__ + +#include +#include +#include + +extern const char *const bpf_alu_string[16]; +extern const char *const bpf_class_string[8]; + +const char *func_id_name(int id); + +struct bpf_verifier_env; +typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, + const char *, ...); +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks); + +#endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a352f93cd4b2..274c6582ec39 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,8 @@ #include #include +#include "disasm.h" + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -194,22 +196,6 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { - __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *func_id_name(int id) -{ - BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - - if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) - return func_id_str[id]; - else - return "unknown"; -} - static void print_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *state) { @@ -278,187 +264,6 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static const char *const bpf_class_string[] = { - [BPF_LD] = "ld", - [BPF_LDX] = "ldx", - [BPF_ST] = "st", - [BPF_STX] = "stx", - [BPF_ALU] = "alu", - [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", - [BPF_ALU64] = "alu64", -}; - -static const char *const bpf_alu_string[16] = { - [BPF_ADD >> 4] = "+=", - [BPF_SUB >> 4] = "-=", - [BPF_MUL >> 4] = "*=", - [BPF_DIV >> 4] = "/=", - [BPF_OR >> 4] = "|=", - [BPF_AND >> 4] = "&=", - [BPF_LSH >> 4] = "<<=", - [BPF_RSH >> 4] = ">>=", - [BPF_NEG >> 4] = "neg", - [BPF_MOD >> 4] = "%=", - [BPF_XOR >> 4] = "^=", - [BPF_MOV >> 4] = "=", - [BPF_ARSH >> 4] = "s>>=", - [BPF_END >> 4] = "endian", -}; - -static const char *const bpf_ldst_string[] = { - [BPF_W >> 3] = "u32", - [BPF_H >> 3] = "u16", - [BPF_B >> 3] = "u8", - [BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { - [BPF_JA >> 4] = "jmp", - [BPF_JEQ >> 4] = "==", - [BPF_JGT >> 4] = ">", - [BPF_JLT >> 4] = "<", - [BPF_JGE >> 4] = ">=", - [BPF_JLE >> 4] = "<=", - [BPF_JSET >> 4] = "&", - [BPF_JNE >> 4] = "!=", - [BPF_JSGT >> 4] = "s>", - [BPF_JSLT >> 4] = "s<", - [BPF_JSGE >> 4] = "s>=", - [BPF_JSLE >> 4] = "s<=", - [BPF_CALL >> 4] = "call", - [BPF_EXIT >> 4] = "exit", -}; - -static void print_bpf_end_insn(struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, - BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", - insn->imm, insn->dst_reg); -} - -static void print_bpf_insn(struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_OP(insn->code) == BPF_END) { - if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); - else - print_bpf_end_insn(env, insn); - } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", - insn->dst_reg); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->src_reg); - } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->imm); - } - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, insn->off, - insn->src_reg); - else - verbose(env, "BUG_%02x\n", insn->code); - } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); - return; - } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); - } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); - return; - } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", - insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->off); - } else if (class == BPF_LD) { - if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM && - BPF_SIZE(insn->code) == BPF_DW) { - /* At this point, we already made sure that the second - * part of the ldimm64 insn is accessible. - */ - u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; - - if (map_ptr && !env->allow_ptr_leaks) - imm = 0; - - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); - } else { - verbose(env, "BUG_ld_%02x\n", insn->code); - return; - } - } else if (class == BPF_JMP) { - u8 opcode = BPF_OP(insn->code); - - if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); - } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", - insn->code, insn->off); - } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->src_reg, insn->off); - } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); - } - } else { - verbose(env, "(%02x) %s\n", - insn->code, bpf_class_string[class]); - } -} - static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) { struct bpf_verifier_stack_elem *elem; @@ -3767,7 +3572,8 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { verbose(env, "%d: ", insn_idx); - print_bpf_insn(env, insn); + print_bpf_insn(verbose, env, insn, + env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); -- cgit From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:15 -0700 Subject: bpf: write back the verifier log buffer as it gets filled Verifier log buffer can be quite large (up to 16MB currently). As Eric Dumazet points out if we allow multiple verification requests to proceed simultaneously, malicious user may use the verifier as a way of allocating large amounts of unswappable memory to OOM the host. Switch to a strategy of allocating a smaller buffer (1024B) and writing it out into the user buffer after every print. While at it remove the old BUG_ON(). This is in preparation of the global verifier lock removal. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 274c6582ec39..2cdbcc4f8f6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -165,15 +165,26 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, const char *fmt, ...) { struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (!log->level || bpf_verifier_log_full(log)) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log->len_used += vscnprintf(log->kbuf + log->len_used, - log->len_total - log->len_used, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; } static bool type_is_pkt_pointer(enum bpf_reg_type type) @@ -4263,11 +4274,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || !log->level || !log->ubuf) goto err_unlock; - - ret = -ENOMEM; - log->kbuf = vmalloc(log->len_total); - if (!log->kbuf) - goto err_unlock; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4304,18 +4310,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log->level && bpf_verifier_log_full(log)) { - BUG_ON(log->len_used >= log->len_total); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log->level && copy_to_user(log->ubuf, log->kbuf, - log->len_used + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -4326,7 +4325,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4339,9 +4338,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log->level) - vfree(log->kbuf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. -- cgit From d59158162e032917a428704160a2063a02405ec6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 10 Oct 2017 15:51:37 -0700 Subject: tracing: Add support for preempt and irq enable/disable events Preempt and irq trace events can be used for tracing the start and end of an atomic section which can be used by a trace viewer like systrace to graphically view the start and end of an atomic section and correlate them with latencies and scheduling issues. This also serves as a prelude to using synthetic events or probes to rewrite the preempt and irqsoff tracers, along with numerous benefits of using trace events features for these events. Link: http://lkml.kernel.org/r/20171006005432.14244-3-joelaf@google.com Link: http://lkml.kernel.org/r/20171010225137.17370-1-joelaf@google.com Cc: Peter Zilstra Cc: kernel-team@android.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 11 +++++++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 35 ++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..b8395a020821 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER address on the current task structure into a stack of calls. +config PREEMPTIRQ_EVENTS + bool "Enable trace events for preempt and irq disable/enable" + select TRACE_IRQFLAGS + depends on DEBUG_PREEMPT || !PROVE_LOCKING + default n + help + Enable tracing of disable and enable events for preemption and irqs. + For tracing preempt disable/enable events, DEBUG_PREEMPT must be + enabled. For tracing irq disable/enable events, PROVE_LOCKING must + be disabled. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 90f2701d92a7..9f62eee61f14 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 0e3033c00474..03ecb4465ee4 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,9 @@ #include "trace.h" +#define CREATE_TRACE_POINTS +#include + #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -777,26 +780,53 @@ static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } #endif #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + void trace_hardirqs_on(void) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_off(); } EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_on_caller(caller_addr); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_off_caller(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -818,14 +848,17 @@ inline void print_irqtrace_events(struct task_struct *curr) } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preempt_enable_rcuidle(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preempt_disable_rcuidle(a0, a1); tracer_preempt_off(a0, a1); } #endif -- cgit From 8715b108cd75523c9b2e833cdcd7aeb363767f95 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 9 Oct 2017 12:29:31 -0700 Subject: ftrace: Clear hashes of stale ips of init memory Filters should be cleared of init functions during freeing of init memory when the ftrace dyn records are released. However in current code, the filters are left as is. This patch clears the hashes of the saved init functions when the init memory is freed. This fixes the following issue reproducible with the following sequence of commands for a test module: ================================================ void bar(void) { printk(KERN_INFO "bar!\n"); } void foo(void) { printk(KERN_INFO "foo!\n"); bar(); } static int __init hello_init(void) { printk(KERN_INFO "Hello world!\n"); foo(); return 0; } static void __exit hello_cleanup(void) { printk(KERN_INFO "Cleaning up module.\n"); } module_init(hello_init); module_exit(hello_cleanup); ================================================ Commands: echo '*:mod:test' > /d/tracing/set_ftrace_filter echo function > /d/tracing/current_tracer modprobe test rmmod test sleep 1 modprobe test cat /d/tracing/set_ftrace_filter Behavior without patch: Init function is still in the filter Expected behavior: Shouldn't have any of the filters set Link: http://lkml.kernel.org/r/20171009192931.56401-1-joelaf@google.com Signed-off-by: Joel Fernandes Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e99bd55732e..e0a98225666b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6067,6 +6067,63 @@ allocate_ftrace_mod_map(struct module *mod, } #endif /* CONFIG_MODULES */ +struct ftrace_init_func { + struct list_head list; + unsigned long ip; +}; + +/* Clear any init ips from hashes */ +static void +clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + + if (ftrace_hash_empty(hash)) + return; + + entry = __ftrace_lookup_ip(hash, func->ip); + + /* + * Do not allow this rec to match again. + * Yeah, it may waste some memory, but will be removed + * if/when the hash is modified again. + */ + if (entry) + entry->ip = 0; +} + +static void +clear_func_from_hashes(struct ftrace_init_func *func) +{ + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (!tr->ops || !tr->ops->func_hash) + continue; + mutex_lock(&tr->ops->func_hash->regex_lock); + clear_func_from_hash(func, tr->ops->func_hash->filter_hash); + clear_func_from_hash(func, tr->ops->func_hash->notrace_hash); + mutex_unlock(&tr->ops->func_hash->regex_lock); + } + mutex_unlock(&trace_types_lock); +} + +static void add_to_clear_hash_list(struct list_head *clear_list, + struct dyn_ftrace *rec) +{ + struct ftrace_init_func *func; + + func = kmalloc(sizeof(*func), GFP_KERNEL); + if (!func) { + WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n"); + return; + } + + func->ip = rec->ip; + list_add(&func->list, clear_list); +} + void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) { unsigned long start = (unsigned long)(start_ptr); @@ -6076,8 +6133,12 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct dyn_ftrace *rec; struct dyn_ftrace key; struct ftrace_mod_map *mod_map = NULL; + struct ftrace_init_func *func, *func_next; + struct list_head clear_hash; int order; + INIT_LIST_HEAD(&clear_hash); + key.ip = start; key.flags = end; /* overload flags, as it is unsigned long */ @@ -6102,6 +6163,9 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) if (!rec) continue; + /* rec will be cleared from hashes after ftrace_lock unlock */ + add_to_clear_hash_list(&clear_hash, rec); + if (mod_map) save_ftrace_mod_rec(mod_map, rec); @@ -6123,6 +6187,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) goto again; } mutex_unlock(&ftrace_lock); + + list_for_each_entry_safe(func, func_next, &clear_hash, list) { + clear_func_from_hashes(func); + kfree(func); + } } void __init ftrace_free_init_mem(void) -- cgit From 952925dec0f276b407b2abce4aee82cba7c700c3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 11:56:23 +0100 Subject: bpf: remove redundant variable old_flags Variable old_flags is being assigned but is never read; it is redundant and can be removed. Cleans up clang warning: Value stored to 'old_flags' is never read Signed-off-by: Colin Ian King Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e88abc0865d5..3db5a17fcfe8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -192,7 +192,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; - u32 old_flags; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -239,7 +238,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = prog; } - old_flags = cgrp->bpf.flags[type]; cgrp->bpf.flags[type] = flags; /* allocate and recompute effective prog arrays */ -- cgit From c5c1ea75a352c3864c4891f36155287a2ed73928 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 13 Jun 2017 13:06:59 +0200 Subject: tracing: Kconfig text fixes for CONFIG_HWLAT_TRACER Trivial spelling fixes for Kconfig help text of config HWLAT_TRACER. Fixes: e7c15cd8a113 ("tracing: Added hardware latency tracer") Signed-off-by: Jesper Dangaard Brouer Acked-by: Steven Rostedt Signed-off-by: Jiri Kosina --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..f54b7b6b4a4b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -224,7 +224,7 @@ config HWLAT_TRACER select GENERIC_TRACER help This tracer, when enabled will create one or more kernel threads, - depening on what the cpumask file is set to, which each thread + depending on what the cpumask file is set to, which each thread spinning in a loop looking for interruptions caused by something other than the kernel. For example, if a System Management Interrupt (SMI) takes a noticeable amount of @@ -239,7 +239,7 @@ config HWLAT_TRACER iteration A kernel thread is created that will spin with interrupts disabled - for "width" microseconds in every "widow" cycle. It will not spin + for "width" microseconds in every "window" cycle. It will not spin for "window - width" microseconds, where the system can continue to operate. -- cgit From af41acf8347dd6d11a2a29a11e2866ca4892d600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 11 Oct 2017 12:46:47 -0400 Subject: printk: Remove superfluous memory barriers from printk_safe The variable printk_safe_irq_ready is set and never cleared at system boot up, when there's only one CPU active. It is set before other CPUs come on line. Also, it is extremely unlikely that an NMI would trigger this early in boot up (which I wonder why we even have this variable at all). Also mark the printk_safe_irq_ready as read mostly, as it is set at system boot up, and never touched again. Link: http://lkml.kernel.org/r/20171011124647.7781f98f@gandalf.local.home Reviewed-by: Petr Mladek Signed-off-by: Steven Rostedt (VMware) --- kernel/printk/printk_safe.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 3cdaeaef9ce1..724d9292d4b9 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -39,7 +39,7 @@ * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. */ -static int printk_safe_irq_ready; +static int printk_safe_irq_ready __read_mostly; #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ sizeof(atomic_t) - \ @@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); /* Get flushed in a more safe context. */ static void queue_flush_work(struct printk_safe_seq_buf *s) { - if (printk_safe_irq_ready) { - /* Make sure that IRQ work is really initialized. */ - smp_rmb(); + if (printk_safe_irq_ready) irq_work_queue(&s->work); - } } /* @@ -398,8 +395,12 @@ void __init printk_safe_init(void) #endif } - /* Make sure that IRQ works are initialized before enabling. */ - smp_wmb(); + /* + * In the highly unlikely event that a NMI were to trigger at + * this moment. Make sure IRQ work is set up before this + * variable is set. + */ + barrier(); printk_safe_irq_ready = 1; /* Flush pending messages that did not have scheduled IRQ works. */ -- cgit From c3b5b6ed1eb4f429addfd9e8e8eb39d1a38c85d0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Oct 2017 16:22:20 +0200 Subject: tracing: mark trace_test_buffer as __maybe_unused After trace_selftest_startup_sched_switch is removed, trace_test_buffer() is only used sometimes, leading to this warning: kernel/trace/trace_selftest.c:62:12: error: 'trace_test_buffer' defined but not used [-Werror=unused-function] There is no simple #ifdef condition that captures well whether the function is in fact used or not, so marking it as __maybe_unused is probably the best way to shut up the warning. The function will then be silently dropped when there is no user. Link: http://lkml.kernel.org/r/20171013142227.1273469-1-arnd@arndb.de Fixes: d8c4deee6dc6 ("tracing: Remove obsolete sched_switch tracer selftest") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 364f78abdf47..eb9ba5c1ba40 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -59,7 +59,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) +static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; -- cgit From 1bdec44955edc22fb840f5965987d2972307dcc9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 12 Oct 2017 10:34:07 -0700 Subject: bpf: verifier: set reg_type on context accesses in second pass Use a simplified is_valid_access() callback when verifier is used for program analysis by non-host JITs. This allows us to teach the verifier about packet start and packet end offsets for direct packet access. We can extend the callback as needed but for most packet processing needs there isn't much more the offloads may require. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cdbcc4f8f6b..9755279d94cb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -813,6 +813,36 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } +static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, + struct bpf_insn_access_aux *info) +{ + switch (env->prog->type) { + case BPF_PROG_TYPE_XDP: + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + case BPF_PROG_TYPE_SCHED_CLS: + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + default: + return false; + } +} + /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -821,12 +851,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - /* for analyzer ctx accesses are already validated and converted */ - if (env->analyzer_ops) - return 0; - - if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + if (env->analyzer_ops) { + if (analyzer_is_valid_access(env, off, &info)) { + *reg_type = info.reg_type; + return 0; + } + } else if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower -- cgit From 9185a610f8f7f1b4e4d28c9de27d1969cf58e0f1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 12 Oct 2017 18:40:02 -0400 Subject: tracing: bpf: Hide bpf trace events when they are not used All the trace events defined in include/trace/events/bpf.h are only used when CONFIG_BPF_SYSCALL is defined. But this file gets included by include/linux/bpf_trace.h which is included by the networking code with CREATE_TRACE_POINTS defined. If a trace event is created but not used it still has data structures and functions created for its use, even though nothing is using them. To not waste space, do not define the BPF trace events in bpf.h unless CONFIG_BPF_SYSCALL is defined. Signed-off-by: Steven Rostedt (VMware) Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 248961af2421..8e7c8bf2b687 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1580,5 +1580,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +/* These are only used within the BPF_SYSCALL code */ +#ifdef CONFIG_BPF_SYSCALL EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); +#endif -- cgit From 8fd0fbbe8888f295eb34172a7e47bf7d3a0a4687 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:29 +0200 Subject: perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on ftrace:function") Revert commit: 75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function") The reason I instantly stumbled on that patch is that it only addresses the ftrace situation and doesn't mention the other _5_ places that use this interface. It doesn't explain why those don't have the problem and if not, why their solution doesn't work for ftrace. It doesn't, but this is just putting more duct tape on. Link: http://lkml.kernel.org/r/20171011080224.200565770@infradead.org Cc: Zhou Chengming Cc: Jiri Olsa Cc: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/events/core.c | 13 ++++--------- kernel/trace/trace_event_perf.c | 4 +--- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 5 files changed, 10 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..b8db80c5513b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,15 +7954,16 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task, NULL); + rctx, task); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task, struct perf_event *event) + struct task_struct *task) { struct perf_sample_data data; + struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7976,15 +7977,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - /* Use the given event instead of the hlist */ - if (event) { + hlist_for_each_entry_rcu(event, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); - } else { - hlist_for_each_entry_rcu(event, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } } /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 13ba2d3f6a91..562fa69df5d3 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,7 +306,6 @@ static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { - struct perf_event *event; struct ftrace_entry *entry; struct hlist_head *head; struct pt_regs regs; @@ -330,9 +329,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; - event = container_of(ops, struct perf_event, ftrace_ops); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, ®s, head, NULL, event); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index af6134f2e597..996902a526d4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..934b0da72679 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -622,7 +622,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, - head, NULL, NULL); + head, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -716,7 +716,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) } perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, - 1, regs, head, NULL, NULL); + 1, regs, head, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b34965e62fb9..402120ba4594 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, } perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL, NULL); + head, NULL); out: preempt_enable(); } -- cgit From 466c81c45b650deca213fda3d0ec4761667379a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Oct 2017 17:15:47 +0200 Subject: perf/ftrace: Fix function trace events The function-trace <-> perf interface is a tad messed up. Where all the other trace <-> perf interfaces use a single trace hook registration and use per-cpu RCU based hlist to iterate the events, function-trace actually needs multiple hook registrations in order to minimize function entry patching when filters are present. The end result is that we iterate events both on the trace hook and on the hlist, which results in reporting events multiple times. Since function-trace cannot use the regular scheme, fix it the other way around, use singleton hlists. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 80 +++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..e73f9ab15939 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event) int perf_trace_add(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - struct hlist_head __percpu *pcpu_list; - struct hlist_head *list; - - pcpu_list = tp_event->perf_events; - if (WARN_ON_ONCE(!pcpu_list)) - return -EINVAL; if (!(flags & PERF_EF_START)) p_event->hw.state = PERF_HES_STOPPED; - list = this_cpu_ptr(pcpu_list); - hlist_add_head_rcu(&p_event->hlist_entry, list); + /* + * If TRACE_REG_PERF_ADD returns false; no custom action was performed + * and we need to take the default action of enqueueing our event on + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) { + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + } - return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); + return 0; } void perf_trace_del(struct perf_event *p_event, int flags) { struct trace_event_call *tp_event = p_event->tp_event; - hlist_del_rcu(&p_event->hlist_entry); - tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); + + /* + * If TRACE_REG_PERF_DEL returns false; no custom action was performed + * and we need to take the default action of dequeueing our event from + * the right per-cpu hlist. + */ + if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event)) + hlist_del_rcu(&p_event->hlist_entry); } void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) @@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; - struct hlist_head *head; + struct perf_event *event; + struct hlist_head head; struct pt_regs regs; int rctx; - head = this_cpu_ptr(event_function.perf_events); - if (hlist_empty(head)) + if ((unsigned long)ops->private != smp_processor_id()) return; + event = container_of(ops, struct perf_event, ftrace_ops); + + /* + * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all + * the perf code does is hlist_for_each_entry_rcu(), so we can + * get away with simply setting the @head.first pointer in order + * to create a singular list. + */ + head.first = &event->hlist_entry; + #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ sizeof(u64)) - sizeof(u32)) @@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, ®s, head, NULL); + 1, ®s, &head, NULL); #undef ENTRY_SIZE } @@ -339,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; - ops->func = perf_ftrace_function_call; + ops->flags |= FTRACE_OPS_FL_RCU; + ops->func = perf_ftrace_function_call; + ops->private = (void *)(unsigned long)nr_cpu_ids; + return register_ftrace_function(ops); } @@ -352,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event) return ret; } -static void perf_ftrace_function_enable(struct perf_event *event) -{ - ftrace_function_local_enable(&event->ftrace_ops); -} - -static void perf_ftrace_function_disable(struct perf_event *event) -{ - ftrace_function_local_disable(&event->ftrace_ops); -} - int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data) { + struct perf_event *event = data; + switch (type) { case TRACE_REG_REGISTER: case TRACE_REG_UNREGISTER: @@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call, case TRACE_REG_PERF_CLOSE: return perf_ftrace_function_unregister(data); case TRACE_REG_PERF_ADD: - perf_ftrace_function_enable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id(); + return 1; case TRACE_REG_PERF_DEL: - perf_ftrace_function_disable(data); - return 0; + event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids; + return 1; } return -EINVAL; -- cgit From 1dd311e6dcda4020c603bcf9f390a577d439d509 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:31 +0200 Subject: perf/ftrace: Small cleanup ops->flags _should_ be 0 at this point, so setting the flag using bitwise or is a bit daft. Link: http://lkml.kernel.org/r/20171011080224.315585202@infradead.org Requested-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e73f9ab15939..55d6dff37daf 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -363,7 +363,7 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_RCU; + ops->flags = FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; ops->private = (void *)(unsigned long)nr_cpu_ids; -- cgit From b3a88803ac5b4bda26017b485c8722a8487fefb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Oct 2017 09:45:32 +0200 Subject: ftrace: Kill FTRACE_OPS_FL_PER_CPU The one and only user of FTRACE_OPS_FL_PER_CPU is gone, remove the lot. Link: http://lkml.kernel.org/r/20171011080224.372422809@infradead.org Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 55 ++++++--------------------------------------------- 1 file changed, 6 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e0a98225666b..2fd3edaec6de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -203,30 +203,6 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void per_cpu_ops_disable_all(struct ftrace_ops *ops) -{ - int cpu; - - for_each_possible_cpu(cpu) - *per_cpu_ptr(ops->disabled, cpu) = 1; -} - -static int per_cpu_ops_alloc(struct ftrace_ops *ops) -{ - int __percpu *disabled; - - if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) - return -EINVAL; - - disabled = alloc_percpu(int); - if (!disabled) - return -ENOMEM; - - ops->disabled = disabled; - per_cpu_ops_disable_all(ops); - return 0; -} - static void ftrace_sync(struct work_struct *work) { /* @@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | - FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) || + FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_PER_CPU) { - if (per_cpu_ops_alloc(ops)) - return -ENOMEM; - } - add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ @@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void per_cpu_ops_free(struct ftrace_ops *ops) -{ - free_percpu(ops->disabled); -} - static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) goto free_ops; return 0; @@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * The same goes for freeing the per_cpu data of the per_cpu * ops. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) { /* * We need to do a hard force of sched synchronization. * This is because we use preempt_disable() to do RCU, but @@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) free_ops: arch_ftrace_trampoline_free(ops); - - if (ops->flags & FTRACE_OPS_FL_PER_CPU) - per_cpu_ops_free(ops); } return 0; @@ -6355,10 +6318,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * If any of the above fails then the op->func() is not executed. */ if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && - (!(op->flags & FTRACE_OPS_FL_PER_CPU) || - !ftrace_function_local_disabled(op)) && ftrace_ops_test(op, ip, regs)) { - if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -6416,10 +6376,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); - if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || - !ftrace_function_local_disabled(op)) { - op->func(ip, parent_ip, op, regs); - } + op->func(ip, parent_ip, op, regs); preempt_enable_notrace(); trace_clear_recursion(bit); @@ -6443,7 +6400,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) * or does per cpu logic, then we need to call the assist handler. */ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + ops->flags & FTRACE_OPS_FL_RCU) return ftrace_ops_assist_func; return ops->func; -- cgit From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:28 +0200 Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP The 'cpumap' is primarily used as a backend map for XDP BPF helper call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. This patch implement the main part of the map. It is not connected to the XDP redirect system yet, and no SKB allocation are done yet. The main concern in this patch is to ensure the datapath can run without any locking. This adds complexity to the setup and tear-down procedure, which assumptions are extra carefully documented in the code comments. V2: - make sure array isn't larger than NR_CPUS - make sure CPUs added is a valid possible CPU V3: fix nitpicks from Jakub Kicinski V5: - Restrict map allocation to root / CAP_SYS_ADMIN - WARN_ON_ONCE if queue is not empty on tear-down - Return -EPERM on memlock limit instead of -ENOMEM - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup() - Moved cpu_map_enqueue() to next patch V6: all notice by Daniel Borkmann - Fix err return code in cpu_map_alloc() introduced in V5 - Move cpu_possible() check after max_entries boundary check - Forbid usage initially in check_map_func_compatibility() V7: - Fix alloc error path spotted by Daniel Borkmann - Did stress test adding+removing CPUs from the map concurrently - Fixed refcnt issue on cpu_map_entry, kthread started too soon - Make sure packets are flushed during tear-down, involved use of rcu_barrier() and kthread_run only exit after queue is empty - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring V8: - Nitpicking comments and gramma by Edward Cree - Fix missing semi-colon introduced in V7 due to rebasing - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch Signed-off-by: Jesper Dangaard Brouer Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 1 + kernel/bpf/cpumap.c | 560 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 8 +- kernel/bpf/verifier.c | 5 + 4 files changed, 573 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/cpumap.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 53fb09f92e3f..e597daae6120 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += cpumap.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c new file mode 100644 index 000000000000..e1e25ddba038 --- /dev/null +++ b/kernel/bpf/cpumap.c @@ -0,0 +1,560 @@ +/* bpf/cpumap.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ + +/* The 'cpumap' is primarily used as a backend map for XDP BPF helper + * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. + * + * Unlike devmap which redirects XDP frames out another NIC device, + * this map type redirects raw XDP frames to another CPU. The remote + * CPU will do SKB-allocation and call the normal network stack. + * + * This is a scalability and isolation mechanism, that allow + * separating the early driver network XDP layer, from the rest of the + * netstack, and assigning dedicated CPUs for this stage. This + * basically allows for 10G wirespeed pre-filtering via bpf. + */ +#include +#include +#include + +#include +#include +#include +#include + +/* General idea: XDP packets getting XDP redirected to another CPU, + * will maximum be stored/queued for one driver ->poll() call. It is + * guaranteed that setting flush bit and flush operation happen on + * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() + * which queue in bpf_cpu_map_entry contains packets. + */ + +#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct xdp_bulk_queue { + void *q[CPU_MAP_BULK_SIZE]; + unsigned int count; +}; + +/* Struct for every remote "destination" CPU in map */ +struct bpf_cpu_map_entry { + u32 qsize; /* Queue size placeholder for map lookup */ + + /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ + struct xdp_bulk_queue __percpu *bulkq; + + /* Queue with potential multi-producers, and single-consumer kthread */ + struct ptr_ring *queue; + struct task_struct *kthread; + struct work_struct kthread_stop_wq; + + atomic_t refcnt; /* Control when this struct can be free'ed */ + struct rcu_head rcu; +}; + +struct bpf_cpu_map { + struct bpf_map map; + /* Below members specific for map type */ + struct bpf_cpu_map_entry **cpu_map; + unsigned long __percpu *flush_needed; +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq); + +static u64 cpu_map_bitmap_size(const union bpf_attr *attr) +{ + return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); +} + +static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) +{ + struct bpf_cpu_map *cmap; + int err = -ENOMEM; + u64 cost; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + return ERR_PTR(-EINVAL); + + cmap = kzalloc(sizeof(*cmap), GFP_USER); + if (!cmap) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + cmap->map.map_type = attr->map_type; + cmap->map.key_size = attr->key_size; + cmap->map.value_size = attr->value_size; + cmap->map.max_entries = attr->max_entries; + cmap->map.map_flags = attr->map_flags; + cmap->map.numa_node = bpf_map_attr_numa_node(attr); + + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + if (cmap->map.max_entries > NR_CPUS) { + err = -E2BIG; + goto free_cmap; + } + + /* make sure page count doesn't overflow */ + cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); + cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_cmap; + cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + ret = bpf_map_precharge_memlock(cmap->map.pages); + if (ret) { + err = ret; + goto free_cmap; + } + + /* A per cpu bitfield with a bit per possible CPU in map */ + cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), + __alignof__(unsigned long)); + if (!cmap->flush_needed) + goto free_cmap; + + /* Alloc array for possible remote "destination" CPUs */ + cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * + sizeof(struct bpf_cpu_map_entry *), + cmap->map.numa_node); + if (!cmap->cpu_map) + goto free_percpu; + + return &cmap->map; +free_percpu: + free_percpu(cmap->flush_needed); +free_cmap: + kfree(cmap); + return ERR_PTR(err); +} + +void __cpu_map_queue_destructor(void *ptr) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. + */ + if (WARN_ON_ONCE(ptr)) + page_frag_free(ptr); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); + kfree(rcpu->queue); + kfree(rcpu); + } +} + +static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + atomic_inc(&rcpu->refcnt); +} + +/* called from workqueue, to workaround syscall using preempt_disable */ +static void cpu_map_kthread_stop(struct work_struct *work) +{ + struct bpf_cpu_map_entry *rcpu; + + rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); + + /* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, + * as it waits until all in-flight call_rcu() callbacks complete. + */ + rcu_barrier(); + + /* kthread_stop will wake_up_process and wait for it to complete */ + kthread_stop(rcpu->kthread); +} + +static int cpu_map_kthread_run(void *data) +{ + struct bpf_cpu_map_entry *rcpu = data; + + set_current_state(TASK_INTERRUPTIBLE); + + /* When kthread gives stop order, then rcpu have been disconnected + * from map, thus no new packets can enter. Remaining in-flight + * per CPU stored packets are flushed to this queue. Wait honoring + * kthread_stop signal until queue is empty. + */ + while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + struct xdp_pkt *xdp_pkt; + + schedule(); + /* Do work */ + while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { + /* For now just "refcnt-free" */ + page_frag_free(xdp_pkt); + } + __set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + put_cpu_map_entry(rcpu); + return 0; +} + +struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +{ + gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + struct bpf_cpu_map_entry *rcpu; + int numa, err; + + /* Have map->numa_node, but choose node of redirect target CPU */ + numa = cpu_to_node(cpu); + + rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + if (!rcpu) + return NULL; + + /* Alloc percpu bulkq */ + rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), + sizeof(void *), gfp); + if (!rcpu->bulkq) + goto free_rcu; + + /* Alloc queue */ + rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + if (!rcpu->queue) + goto free_bulkq; + + err = ptr_ring_init(rcpu->queue, qsize, gfp); + if (err) + goto free_queue; + + rcpu->qsize = qsize; + + /* Setup kthread */ + rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, + "cpumap/%d/map:%d", cpu, map_id); + if (IS_ERR(rcpu->kthread)) + goto free_ptr_ring; + + get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ + get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ + + /* Make sure kthread runs on a single CPU */ + kthread_bind(rcpu->kthread, cpu); + wake_up_process(rcpu->kthread); + + return rcpu; + +free_ptr_ring: + ptr_ring_cleanup(rcpu->queue, NULL); +free_queue: + kfree(rcpu->queue); +free_bulkq: + free_percpu(rcpu->bulkq); +free_rcu: + kfree(rcpu); + return NULL; +} + +void __cpu_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_cpu_map_entry *rcpu; + int cpu; + + /* This cpu_map_entry have been disconnected from map and one + * RCU graze-period have elapsed. Thus, XDP cannot queue any + * new packets and cannot change/set flush_needed that can + * find this entry. + */ + rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); + + /* Flush remaining packets in percpu bulkq */ + for_each_online_cpu(cpu) { + struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); + + /* No concurrent bq_enqueue can run at this point */ + bq_flush_to_queue(rcpu, bq); + } + free_percpu(rcpu->bulkq); + /* Cannot kthread_stop() here, last put free rcpu resources */ + put_cpu_map_entry(rcpu); +} + +/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to + * ensure any driver rcu critical sections have completed, but this + * does not guarantee a flush has happened yet. Because driver side + * rcu_read_lock/unlock only protects the running XDP program. The + * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a + * pending flush op doesn't fail. + * + * The bpf_cpu_map_entry is still used by the kthread, and there can + * still be pending packets (in queue and percpu bulkq). A refcnt + * makes sure to last user (kthread_stop vs. call_rcu) free memory + * resources. + * + * The rcu callback __cpu_map_entry_free flush remaining packets in + * percpu bulkq to queue. Due to caller map_delete_elem() disable + * preemption, cannot call kthread_stop() to make sure queue is empty. + * Instead a work_queue is started for stopping kthread, + * cpu_map_kthread_stop, which waits for an RCU graze period before + * stopping kthread, emptying the queue. + */ +void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +{ + struct bpf_cpu_map_entry *old_rcpu; + + old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu); + if (old_rcpu) { + call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); + INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); + schedule_work(&old_rcpu->kthread_stop_wq); + } +} + +int cpu_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 key_cpu = *(u32 *)key; + + if (key_cpu >= map->max_entries) + return -EINVAL; + + /* notice caller map_delete_elem() use preempt_disable() */ + __cpu_map_entry_replace(cmap, key_cpu, NULL); + return 0; +} + +int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + /* Array index key correspond to CPU number */ + u32 key_cpu = *(u32 *)key; + /* Value is the queue size */ + u32 qsize = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(key_cpu >= cmap->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + if (unlikely(qsize > 16384)) /* sanity limit on qsize */ + return -EOVERFLOW; + + /* Make sure CPU is a valid possible cpu */ + if (!cpu_possible(key_cpu)) + return -ENODEV; + + if (qsize == 0) { + rcpu = NULL; /* Same as deleting */ + } else { + /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ + rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); + if (!rcpu) + return -ENOMEM; + } + rcu_read_lock(); + __cpu_map_entry_replace(cmap, key_cpu, rcpu); + rcu_read_unlock(); + return 0; +} + +void cpu_map_free(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + int cpu; + u32 i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the bpf programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. + * It does __not__ ensure pending flush operations (if any) are + * complete. + */ + synchronize_rcu(); + + /* To ensure all pending flush operations have completed wait for flush + * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * Because the above synchronize_rcu() ensures the map is disconnected + * from the program we can assume no new bits will be set. + */ + for_each_online_cpu(cpu) { + unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + + while (!bitmap_empty(bitmap, cmap->map.max_entries)) + cond_resched(); + } + + /* For cpu_map the remote CPUs can still be using the entries + * (struct bpf_cpu_map_entry). + */ + for (i = 0; i < cmap->map.max_entries; i++) { + struct bpf_cpu_map_entry *rcpu; + + rcpu = READ_ONCE(cmap->cpu_map[i]); + if (!rcpu) + continue; + + /* bq flush and cleanup happens after RCU graze-period */ + __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ + } + free_percpu(cmap->flush_needed); + bpf_map_area_free(cmap->cpu_map); + kfree(cmap); +} + +struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + struct bpf_cpu_map_entry *rcpu; + + if (key >= map->max_entries) + return NULL; + + rcpu = READ_ONCE(cmap->cpu_map[key]); + return rcpu; +} + +static void *cpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_cpu_map_entry *rcpu = + __cpu_map_lookup_elem(map, *(u32 *)key); + + return rcpu ? &rcpu->qsize : NULL; +} + +static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= cmap->map.max_entries) { + *next = 0; + return 0; + } + + if (index == cmap->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +const struct bpf_map_ops cpu_map_ops = { + .map_alloc = cpu_map_alloc, + .map_free = cpu_map_free, + .map_delete_elem = cpu_map_delete_elem, + .map_update_elem = cpu_map_update_elem, + .map_lookup_elem = cpu_map_lookup_elem, + .map_get_next_key = cpu_map_get_next_key, +}; + +static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, + struct xdp_bulk_queue *bq) +{ + struct ptr_ring *q; + int i; + + if (unlikely(!bq->count)) + return 0; + + q = rcpu->queue; + spin_lock(&q->producer_lock); + + for (i = 0; i < bq->count; i++) { + void *xdp_pkt = bq->q[i]; + int err; + + err = __ptr_ring_produce(q, xdp_pkt); + if (err) { + /* Free xdp_pkt */ + page_frag_free(xdp_pkt); + } + } + bq->count = 0; + spin_unlock(&q->producer_lock); + + return 0; +} + +/* Notice: Will change in later patch */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; +}; + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); + + if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) + bq_flush_to_queue(rcpu, bq); + + /* Notice, xdp_buff/page MUST be queued here, long enough for + * driver to code invoking us to finished, due to driver + * (e.g. ixgbe) recycle tricks based on page-refcnt. + * + * Thus, incoming xdp_pkt is always queued here (else we race + * with another CPU on page-refcnt and remaining driver code). + * Queue time is very short, as driver will invoke flush + * operation, when completing napi->poll call. + */ + bq->q[bq->count++] = xdp_pkt; + return 0; +} + +void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + + __set_bit(bit, bitmap); +} + +void __cpu_map_flush(struct bpf_map *map) +{ + struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); + unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); + u32 bit; + + /* The napi->poll softirq makes sure __cpu_map_insert_ctx() + * and __cpu_map_flush() happen on same CPU. Thus, the percpu + * bitmap indicate which percpu bulkq have packets. + */ + for_each_set_bit(bit, bitmap, map->max_entries) { + struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); + struct xdp_bulk_queue *bq; + + /* This is possible if entry is removed by user space + * between xdp redirect and flush op. + */ + if (unlikely(!rcpu)) + continue; + + __clear_bit(bit, bitmap); + + /* Flush all frames in bulkq to real queue */ + bq = this_cpu_ptr(rcpu->bulkq); + bq_flush_to_queue(rcpu, bq); + + /* If already running, costs spin_lock_irqsave + smb_mb */ + wake_up_process(rcpu->kthread); + } +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d124e702e040..54fba06942f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; + /* Need to create a kthread, thus must support schedule */ + if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + err = map->ops->map_update_elem(map, key, value, attr->flags); + goto out; + } + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from * inside bpf map update or delete otherwise deadlocks are possible */ @@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr) } __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_update_elem(map, ufd, key, value); free_value: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9755279d94cb..cefa64be9a2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; + /* Restrict bpf side of cpumap, open when use-cases appear */ + case BPF_MAP_TYPE_CPUMAP: + if (func_id != BPF_FUNC_redirect_map) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) -- cgit From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:34 +0200 Subject: bpf: XDP_REDIRECT enable use of cpumap This patch connects cpumap to the xdp_do_redirect_map infrastructure. Still no SKB allocation are done yet. The XDP frames are transferred to the other CPU, but they are simply refcnt decremented on the remote CPU. This served as a good benchmark for measuring the overhead of remote refcnt decrement. If driver page recycle cache is not efficient then this, exposes a bottleneck in the page allocator. A shout-out to MST's ptr_ring, which is the secret behind is being so efficient to transfer memory pointers between CPUs, without constantly bouncing cache-lines between CPUs. V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot. V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet, as implementation require a separate upstream discussion. V5: - Fix a maybe-uninitialized pointed out by kbuild test robot. - Restrict bpf-prog side access to cpumap, open when use-cases appear - Implement cpu_map_enqueue() as a more simple void pointer enqueue V6: - Allow cpumap type for usage in helper bpf_redirect_map, general bpf-prog side restriction moved to earlier patch. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 22 +++++++++++++++++++++- kernel/bpf/verifier.c | 3 ++- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e1e25ddba038..768da6a2c265 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -500,7 +500,7 @@ struct xdp_pkt { /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -520,6 +520,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) return 0; } +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct xdp_pkt *xdp_pkt; + int headroom; + + /* For now this is just used as a void pointer to data_hard_start. + * Followup patch will generalize this. + */ + xdp_pkt = xdp->data_hard_start; + + /* Fake writing into xdp_pkt->data to measure overhead */ + headroom = xdp->data - xdp->data_hard_start; + if (headroom < sizeof(*xdp_pkt)) + xdp_pkt->data = xdp->data; + + bq_enqueue(rcpu, xdp_pkt); + return 0; +} + void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cefa64be9a2f..e4d5136725a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1486,7 +1486,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_redirect_map: - if (map->map_type != BPF_MAP_TYPE_DEVMAP) + if (map->map_type != BPF_MAP_TYPE_DEVMAP && + map->map_type != BPF_MAP_TYPE_CPUMAP) goto error; break; case BPF_FUNC_sk_redirect_map: -- cgit From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:39 +0200 Subject: bpf: cpumap xdp_buff to skb conversion and allocation This patch makes cpumap functional, by adding SKB allocation and invoking the network stack on the dequeuing CPU. For constructing the SKB on the remote CPU, the xdp_buff in converted into a struct xdp_pkt, and it mapped into the top headroom of the packet, to avoid allocating separate mem. For now, struct xdp_pkt is just a cpumap internal data structure, with info carried between enqueue to dequeue. If a driver doesn't have enough headroom it is simply dropped, with return code -EOVERFLOW. This will be picked up the xdp tracepoint infrastructure, to allow users to catch this. V2: take into account xdp->data_meta V4: - Drop busypoll tricks, keeping it more simple. - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei V5: correct RCU read protection around __netif_receive_skb_core. V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 152 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 768da6a2c265..ee7adf4352dd 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -25,6 +25,9 @@ #include #include +#include /* netif_receive_skb_core */ +#include /* eth_type_trans */ + /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is * guaranteed that setting flush bit and flush operation happen on @@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } +/* For now, xdp_pkt is a cpumap internal data structure, with info + * carried between enqueue to dequeue. It is mapped into the top + * headroom of the packet, to avoid allocating separate mem. + */ +struct xdp_pkt { + void *data; + u16 len; + u16 headroom; + u16 metasize; + struct net_device *dev_rx; +}; + +/* Convert xdp_buff to xdp_pkt */ +static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) +{ + struct xdp_pkt *xdp_pkt; + int metasize; + int headroom; + + /* Assure headroom is available for storing info */ + headroom = xdp->data - xdp->data_hard_start; + metasize = xdp->data - xdp->data_meta; + metasize = metasize > 0 ? metasize : 0; + if ((headroom - metasize) < sizeof(*xdp_pkt)) + return NULL; + + /* Store info in top of packet */ + xdp_pkt = xdp->data_hard_start; + + xdp_pkt->data = xdp->data; + xdp_pkt->len = xdp->data_end - xdp->data; + xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); + xdp_pkt->metasize = metasize; + + return xdp_pkt; +} + +struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) +{ + unsigned int frame_size; + void *pkt_data_start; + struct sk_buff *skb; + + /* build_skb need to place skb_shared_info after SKB end, and + * also want to know the memory "truesize". Thus, need to + * know the memory frame size backing xdp_buff. + * + * XDP was designed to have PAGE_SIZE frames, but this + * assumption is not longer true with ixgbe and i40e. It + * would be preferred to set frame_size to 2048 or 4096 + * depending on the driver. + * frame_size = 2048; + * frame_len = frame_size - sizeof(*xdp_pkt); + * + * Instead, with info avail, skb_shared_info in placed after + * packet len. This, unfortunately fakes the truesize. + * Another disadvantage of this approach, the skb_shared_info + * is not at a fixed memory location, with mixed length + * packets, which is bad for cache-line hotness. + */ + frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + + pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + skb = build_skb(pkt_data_start, frame_size); + if (!skb) + return NULL; + + skb_reserve(skb, xdp_pkt->headroom); + __skb_put(skb, xdp_pkt->len); + if (xdp_pkt->metasize) + skb_metadata_set(skb, xdp_pkt->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + return skb; +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { + unsigned int processed = 0, drops = 0; struct xdp_pkt *xdp_pkt; - schedule(); - /* Do work */ - while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) { - /* For now just "refcnt-free" */ - page_frag_free(xdp_pkt); + /* Release CPU reschedule checks */ + if (__ptr_ring_empty(rcpu->queue)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } else { + cond_resched(); + } + __set_current_state(TASK_RUNNING); + + /* Process packets in rcpu->queue */ + local_bh_disable(); + /* + * The bpf_cpu_map_entry is single consumer, with this + * kthread CPU pinned. Lockless access to ptr_ring + * consume side valid as no-resize allowed of queue. + */ + while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + struct sk_buff *skb; + int ret; + + skb = cpu_map_build_skb(rcpu, xdp_pkt); + if (!skb) { + page_frag_free(xdp_pkt); + continue; + } + + /* Inject into network stack */ + ret = netif_receive_skb_core(skb); + if (ret == NET_RX_DROP) + drops++; + + /* Limit BH-disable period */ + if (++processed == 8) + break; } - __set_current_state(TASK_INTERRUPTIBLE); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, return 0; } -/* Notice: Will change in later patch */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; -}; - /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ @@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { struct xdp_pkt *xdp_pkt; - int headroom; - /* For now this is just used as a void pointer to data_hard_start. - * Followup patch will generalize this. - */ - xdp_pkt = xdp->data_hard_start; + xdp_pkt = convert_to_xdp_pkt(xdp); + if (!xdp_pkt) + return -EOVERFLOW; - /* Fake writing into xdp_pkt->data to measure overhead */ - headroom = xdp->data - xdp->data_hard_start; - if (headroom < sizeof(*xdp_pkt)) - xdp_pkt->data = xdp->data; + /* Info needed when constructing SKB on remote CPU */ + xdp_pkt->dev_rx = dev_rx; bq_enqueue(rcpu, xdp_pkt); return 0; -- cgit From f9419f7bd7a5318b636a941a0214c5cdfa6f6530 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 16 Oct 2017 12:19:44 +0200 Subject: bpf: cpumap add tracepoints This adds two tracepoint to the cpumap. One for the enqueue side trace_xdp_cpumap_enqueue() and one for the kthread dequeue side trace_xdp_cpumap_kthread(). To mitigate the tracepoint overhead, these are invoked during the enqueue/dequeue bulking phases, thus amortizing the cost. The obvious use-cases are for debugging and monitoring. The non-intuitive use-case is using these as a feedback loop to know the system load. One can imagine auto-scaling by reducing, adding or activating more worker CPUs on demand. V4: tracepoint remove time_limit info, instead add sched info V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ee7adf4352dd..b4358d84ddf1 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* netif_receive_skb_core */ #include /* eth_type_trans */ @@ -43,6 +44,8 @@ struct xdp_bulk_queue { /* Struct for every remote "destination" CPU in map */ struct bpf_cpu_map_entry { + u32 cpu; /* kthread CPU and map index */ + int map_id; /* Back reference to map */ u32 qsize; /* Queue size placeholder for map lookup */ /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ @@ -280,15 +283,16 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0; + unsigned int processed = 0, drops = 0, sched = 0; struct xdp_pkt *xdp_pkt; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); + sched = 1; } else { - cond_resched(); + sched = cond_resched(); } __set_current_state(TASK_RUNNING); @@ -318,6 +322,9 @@ static int cpu_map_kthread_run(void *data) if (++processed == 8) break; } + /* Feedback loop via tracepoint */ + trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + local_bh_enable(); /* resched point, may call do_softirq() */ } __set_current_state(TASK_RUNNING); @@ -354,7 +361,9 @@ struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) if (err) goto free_queue; - rcpu->qsize = qsize; + rcpu->cpu = cpu; + rcpu->map_id = map_id; + rcpu->qsize = qsize; /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, @@ -584,6 +593,8 @@ const struct bpf_map_ops cpu_map_ops = { static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, struct xdp_bulk_queue *bq) { + unsigned int processed = 0, drops = 0; + const int to_cpu = rcpu->cpu; struct ptr_ring *q; int i; @@ -599,13 +610,16 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdp_pkt); if (err) { - /* Free xdp_pkt */ - page_frag_free(xdp_pkt); + drops++; + page_frag_free(xdp_pkt); /* Free xdp_pkt */ } + processed++; } bq->count = 0; spin_unlock(&q->producer_lock); + /* Feedback loop via tracepoints */ + trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; } -- cgit From 5ffeb0501c6b36d080de78372fdb70b404b91e9d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 25 Jul 2016 16:07:10 +0100 Subject: genirq: export irq_get_percpu_devid_partition to modules Any modular driver using cluster-affine PPIs needs to be able to call irq_get_percpu_devid_partition so that it can enable the IRQ on the correct subset of CPUs. This patch exports the symbol so that it can be called from within a module. Acked-by: Marc Zyngier Acked-by: Thomas Gleixner Signed-off-by: Will Deacon --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 82afb7ed369f..694c1a9d6485 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -863,6 +863,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) return 0; } +EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); void kstat_incr_irq_this_cpu(unsigned int irq) { -- cgit From bc1d202023eb66f088f736ba423bee1cf135c720 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 16 Aug 2016 16:53:15 +0100 Subject: perf/core: Export AUX buffer helpers to modules Perf PMU drivers using AUX buffers cannot be built as modules unless the AUX helpers are exported. This patch exports perf_aux_output_{begin,end,skip} and perf_get_aux to modules. Cc: Peter Zijlstra Signed-off-by: Will Deacon --- kernel/events/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index f684d8e5fa2b..6d9bffe4d6cc 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -411,6 +411,7 @@ err: return NULL; } +EXPORT_SYMBOL_GPL(perf_aux_output_begin); static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) { @@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb_free_aux(rb); ring_buffer_put(rb); } +EXPORT_SYMBOL_GPL(perf_aux_output_end); /* * Skip over a given number of bytes in the AUX buffer, due to, for example, @@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) return 0; } +EXPORT_SYMBOL_GPL(perf_aux_output_skip); void *perf_get_aux(struct perf_output_handle *handle) { @@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle) return handle->rb->aux_priv; } +EXPORT_SYMBOL_GPL(perf_get_aux); #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) -- cgit From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:53 -0700 Subject: bpf: split verifier and program ops struct bpf_verifier_ops contains both verifier ops and operations used later during program's lifetime (test_run). Split the runtime ops into a different structure. BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops to the names. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 16 +++++++++++++--- kernel/bpf/verifier.c | 12 ++++++------ kernel/trace/bpf_trace.c | 15 ++++++++++++--- 3 files changed, 31 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 54fba06942f5..444902b5a30d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -739,9 +739,18 @@ err_put: return err; } -static const struct bpf_verifier_ops * const bpf_prog_types[] = { -#define BPF_PROG_TYPE(_id, _ops) \ - [_id] = &_ops, +static const struct bpf_prog_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _prog_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, #define BPF_MAP_TYPE(_id, _ops) #include #undef BPF_PROG_TYPE @@ -754,6 +763,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; + prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e4d5136725a2..38e24d69fc95 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -856,8 +856,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + } else if (env->prog->aux->vops->is_valid_access && + env->prog->aux->vops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1565,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->ops->get_func_proto) - fn = env->prog->aux->ops->get_func_proto(func_id); + if (env->prog->aux->vops->get_func_proto) + fn = env->prog->aux->vops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4035,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->ops; + const struct bpf_verifier_ops *ops = env->prog->aux->vops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4236,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->ops->get_func_proto(insn->imm); + fn = prog->aux->vops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 04ea5314f2bc..3126da2f468a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -561,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_verifier_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; +const struct bpf_prog_ops kprobe_prog_ops = { +}; + BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -667,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -727,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_verifier_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; + +const struct bpf_prog_ops perf_event_prog_ops = { +}; -- cgit From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:54 -0700 Subject: bpf: remove the verifier ops from program structure Since the verifier ops don't have to be associated with the program for its entire lifetime we can move it to verifier's struct bpf_verifier_env. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 10 ---------- kernel/bpf/verifier.c | 23 +++++++++++++++++------ 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 444902b5a30d..0e893cac6795 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -748,22 +748,12 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = { #undef BPF_MAP_TYPE }; -static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { -#define BPF_PROG_TYPE(_id, _name) \ - [_id] = & _name ## _verifier_ops, -#define BPF_MAP_TYPE(_id, _ops) -#include -#undef BPF_PROG_TYPE -#undef BPF_MAP_TYPE -}; - static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; prog->aux->ops = bpf_prog_types[type]; - prog->aux->vops = bpf_verifier_ops[type]; prog->type = type; return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 38e24d69fc95..3b6e2c550e96 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23,6 +23,15 @@ #include "disasm.h" +static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +#define BPF_PROG_TYPE(_id, _name) \ + [_id] = & _name ## _verifier_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -856,8 +865,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, *reg_type = info.reg_type; return 0; } - } else if (env->prog->aux->vops->is_valid_access && - env->prog->aux->vops->is_valid_access(off, size, t, &info)) { + } else if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1565,8 +1574,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } - if (env->prog->aux->vops->get_func_proto) - fn = env->prog->aux->vops->get_func_proto(func_id); + if (env->ops->get_func_proto) + fn = env->ops->get_func_proto(func_id); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), @@ -4035,7 +4044,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { - const struct bpf_verifier_ops *ops = env->prog->aux->vops; + const struct bpf_verifier_ops *ops = env->ops; int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; @@ -4236,7 +4245,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = prog->aux->vops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -4294,6 +4303,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->insn_aux_data) goto err_free_env; env->prog = *prog; + env->ops = bpf_verifier_ops[env->prog->type]; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -4406,6 +4416,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; + env->ops = bpf_verifier_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; -- cgit From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 16 Oct 2017 16:40:55 -0700 Subject: bpf: move knowledge about post-translation offsets out of verifier Use the fact that verifier ops are now separate from program ops to define a separate set of callbacks for verification of already translated programs. Since we expect the analyzer ops to be defined only for a small subset of all program types initialize their array by hand (don't use linux/bpf_types.h). Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 55 +++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b6e2c550e96..545b8c45a578 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -822,36 +822,6 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, return err; } -static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, - struct bpf_insn_access_aux *info) -{ - switch (env->prog->type) { - case BPF_PROG_TYPE_XDP: - switch (off) { - case offsetof(struct xdp_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct xdp_buff, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - case BPF_PROG_TYPE_SCHED_CLS: - switch (off) { - case offsetof(struct sk_buff, data): - info->reg_type = PTR_TO_PACKET; - return true; - case offsetof(struct sk_buff, cb) + - offsetof(struct bpf_skb_data_end, data_end): - info->reg_type = PTR_TO_PACKET_END; - return true; - } - return false; - default: - return false; - } -} - /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -860,13 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - if (env->analyzer_ops) { - if (analyzer_is_valid_access(env, off, &info)) { - *reg_type = info.reg_type; - return 0; - } - } else if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + if (env->ops->is_valid_access && + env->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -874,9 +839,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, * will only allow for whole field access and rejects any other * type of narrower access. */ - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; *reg_type = info.reg_type; + if (env->analyzer_ops) + return 0; + + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -4400,12 +4368,21 @@ err_free_env: return ret; } +static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { + [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, + [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +}; + int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, void *priv) { struct bpf_verifier_env *env; int ret; + if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || + !bpf_analyzer_ops[prog->type]) + return -EOPNOTSUPP; + env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -4416,7 +4393,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, if (!env->insn_aux_data) goto err_free_env; env->prog = prog; - env->ops = bpf_verifier_ops[env->prog->type]; + env->ops = bpf_analyzer_ops[env->prog->type]; env->analyzer_ops = ops; env->analyzer_priv = priv; -- cgit From 93862e385ded7c60351e09fcd2a541d273650905 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 13 Oct 2017 15:08:41 -0400 Subject: livepatch: add (un)patch callbacks Provide livepatch modules a klp_object (un)patching notification mechanism. Pre and post-(un)patch callbacks allow livepatch modules to setup or synchronize changes that would be difficult to support in only patched-or-unpatched code contexts. Callbacks can be registered for target module or vmlinux klp_objects, but each implementation is klp_object specific. - Pre-(un)patch callbacks run before any (un)patching transition starts. - Post-(un)patch callbacks run once an object has been (un)patched and the klp_patch fully transitioned to its target state. Example use cases include modification of global data and registration of newly available services/handlers. See Documentation/livepatch/callbacks.txt for details and samples/livepatch/ for examples. Signed-off-by: Joe Lawrence Acked-by: Josh Poimboeuf Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 51 ++++++++++++++++++++++++++++++++++--------- kernel/livepatch/core.h | 38 ++++++++++++++++++++++++++++++++ kernel/livepatch/patch.c | 1 + kernel/livepatch/transition.c | 21 +++++++++++++++--- 4 files changed, 98 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index b9628e43c78f..cafb5a84417d 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj) return obj->name; } -static bool klp_is_object_loaded(struct klp_object *obj) -{ - return !obj->name || obj->mod; -} - /* sets obj->mod if object is not vmlinux and module is found */ static void klp_find_object_module(struct klp_object *obj) { @@ -285,6 +280,8 @@ static int klp_write_object_relocations(struct module *pmod, static int __klp_disable_patch(struct klp_patch *patch) { + struct klp_object *obj; + if (klp_transition_patch) return -EBUSY; @@ -295,6 +292,10 @@ static int __klp_disable_patch(struct klp_patch *patch) klp_init_transition(patch, KLP_UNPATCHED); + klp_for_each_object(patch, obj) + if (patch->enabled && obj->patched) + klp_pre_unpatch_callback(obj); + /* * Enforce the order of the func->transition writes in * klp_init_transition() and the TIF_PATCH_PENDING writes in @@ -388,13 +389,18 @@ static int __klp_enable_patch(struct klp_patch *patch) if (!klp_is_object_loaded(obj)) continue; - ret = klp_patch_object(obj); + ret = klp_pre_patch_callback(obj); if (ret) { - pr_warn("failed to enable patch '%s'\n", - patch->mod->name); + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } - klp_cancel_transition(); - return ret; + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; } } @@ -403,6 +409,11 @@ static int __klp_enable_patch(struct klp_patch *patch) patch->enabled = true; return 0; +err: + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); + return ret; } /** @@ -871,13 +882,27 @@ int klp_module_coming(struct module *mod) pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + obj->name); + goto err; + } + ret = klp_patch_object(obj); if (ret) { pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); + + if (patch != klp_transition_patch) + klp_post_unpatch_callback(obj); + goto err; } + if (patch != klp_transition_patch) + klp_post_patch_callback(obj); + break; } } @@ -927,9 +952,15 @@ void klp_module_going(struct module *mod) * is in transition. */ if (patch->enabled || patch == klp_transition_patch) { + + if (patch != klp_transition_patch) + klp_pre_unpatch_callback(obj); + pr_notice("reverting patch '%s' on unloading module '%s'\n", patch->mod->name, obj->mod->name); klp_unpatch_object(obj); + + klp_post_unpatch_callback(obj); } klp_free_object_loaded(obj); diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index c74f24c47837..6fc907b54e71 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -1,6 +1,44 @@ #ifndef _LIVEPATCH_CORE_H #define _LIVEPATCH_CORE_H +#include + extern struct mutex klp_mutex; +static inline bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +static inline int klp_pre_patch_callback(struct klp_object *obj) +{ + int ret; + + ret = (obj->callbacks.pre_patch) ? + (*obj->callbacks.pre_patch)(obj) : 0; + + obj->callbacks.post_unpatch_enabled = !ret; + + return ret; +} + +static inline void klp_post_patch_callback(struct klp_object *obj) +{ + if (obj->callbacks.post_patch) + (*obj->callbacks.post_patch)(obj); +} + +static inline void klp_pre_unpatch_callback(struct klp_object *obj) +{ + if (obj->callbacks.pre_unpatch) + (*obj->callbacks.pre_unpatch)(obj); +} + +static inline void klp_post_unpatch_callback(struct klp_object *obj) +{ + if (obj->callbacks.post_unpatch_enabled && + obj->callbacks.post_unpatch) + (*obj->callbacks.post_unpatch)(obj); +} + #endif /* _LIVEPATCH_CORE_H */ diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 52c4e907c14b..82d584225dc6 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -28,6 +28,7 @@ #include #include #include +#include "core.h" #include "patch.h" #include "transition.h" diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index b004a1fb6032..7bf55b7f3687 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -109,9 +109,6 @@ static void klp_complete_transition(void) } } - if (klp_target_state == KLP_UNPATCHED && !immediate_func) - module_put(klp_transition_patch->mod); - /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) klp_synchronize_transition(); @@ -130,6 +127,24 @@ static void klp_complete_transition(void) } done: + klp_for_each_object(klp_transition_patch, obj) { + if (!klp_is_object_loaded(obj)) + continue; + if (klp_target_state == KLP_PATCHED) + klp_post_patch_callback(obj); + else if (klp_target_state == KLP_UNPATCHED) + klp_post_unpatch_callback(obj); + } + + /* + * See complementary comment in __klp_enable_patch() for why we + * keep the module reference for immediate patches. + */ + if (!klp_transition_patch->immediate && !immediate_func && + klp_target_state == KLP_UNPATCHED) { + module_put(klp_transition_patch->mod); + } + klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; } -- cgit From 6116c3033a761611b1da980ea664c6ddff3eaed6 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 13 Oct 2017 15:08:42 -0400 Subject: livepatch: move transition "complete" notice into klp_complete_transition() klp_complete_transition() performs a bit of housework before a transition to KLP_PATCHED or KLP_UNPATCHED is actually completed (including post-(un)patch callbacks). To be consistent, move the transition "complete" kernel log notice out of klp_try_complete_transition() and into klp_complete_transition(). Suggested-by: Josh Poimboeuf Acked-by: Josh Poimboeuf Signed-off-by: Joe Lawrence Acked-by: Miroslav Benes Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 7bf55b7f3687..53887f0bca10 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -136,6 +136,9 @@ done: klp_post_unpatch_callback(obj); } + pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + /* * See complementary comment in __klp_enable_patch() for why we * keep the module reference for immediate patches. @@ -423,9 +426,6 @@ void klp_try_complete_transition(void) } success: - pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, - klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* we're done, now cleanup the data structures */ klp_complete_transition(); } -- cgit From af026796054fb70439e919a925615e61b500ef6b Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 13 Oct 2017 15:08:43 -0400 Subject: livepatch: add transition notices Log a few kernel debug messages at the beginning of the following livepatch transition functions: klp_complete_transition() klp_cancel_transition() klp_init_transition() klp_reverse_transition() Also update the log notice message in klp_start_transition() for similar verbiage as the above messages. Suggested-by: Josh Poimboeuf Signed-off-by: Joe Lawrence Acked-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/transition.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 53887f0bca10..56add6327736 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -82,6 +82,10 @@ static void klp_complete_transition(void) unsigned int cpu; bool immediate_func = false; + pr_debug("'%s': completing %s transition\n", + klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + if (klp_target_state == KLP_UNPATCHED) { /* * All tasks have transitioned to KLP_UNPATCHED so we can now @@ -163,6 +167,9 @@ void klp_cancel_transition(void) if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) return; + pr_debug("'%s': canceling patching transition, going to unpatch\n", + klp_transition_patch->mod->name); + klp_target_state = KLP_UNPATCHED; klp_complete_transition(); } @@ -441,7 +448,8 @@ void klp_start_transition(void) WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); - pr_notice("'%s': %s...\n", klp_transition_patch->mod->name, + pr_notice("'%s': starting %s transition\n", + klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* @@ -497,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state) */ klp_target_state = state; + pr_debug("'%s': initializing %s transition\n", patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + /* * If the patch can be applied or reverted immediately, skip the * per-task transitions. @@ -562,6 +573,11 @@ void klp_reverse_transition(void) unsigned int cpu; struct task_struct *g, *task; + pr_debug("'%s': reversing transition from %s\n", + klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching to unpatching" : + "unpatching to patching"); + klp_transition_patch->enabled = !klp_transition_patch->enabled; klp_target_state = !klp_target_state; -- cgit From 9ad0457423af877ad1b76c105a57130da028ccad Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 6 Oct 2017 16:27:26 +0200 Subject: kernel/module: Delete an error message for a failed memory allocation in add_module_usage() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Jessica Yu --- kernel/module.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index de66ec825992..07ef44767245 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -837,10 +837,8 @@ static int add_module_usage(struct module *a, struct module *b) pr_debug("Allocating new usage for %s.\n", a->name); use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - pr_warn("%s: out of memory loading\n", a->name); + if (!use) return -ENOMEM; - } use->source = a; use->target = b; -- cgit From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:22 -0700 Subject: bpf: Add file mode configuration into bpf maps Introduce the map read/write flags to the eBPF syscalls that returns the map fd. The flags is used to set up the file mode when construct a new file descriptor for bpf maps. To not break the backward capability, the f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise it should be O_RDONLY or O_WRONLY. When the userspace want to modify or read the map content, it will check the file mode to see if it is allowed to make the change. Signed-off-by: Chenbo Feng Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/arraymap.c | 6 +++- kernel/bpf/devmap.c | 5 ++- kernel/bpf/hashtab.c | 5 +-- kernel/bpf/inode.c | 15 ++++++--- kernel/bpf/lpm_trie.c | 3 +- kernel/bpf/sockmap.c | 5 ++- kernel/bpf/stackmap.c | 5 ++- kernel/bpf/syscall.c | 88 +++++++++++++++++++++++++++++++++++++++++++++------ 8 files changed, 110 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 68d866628be0..988c04c91e10 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -19,6 +19,9 @@ #include "map_in_map.h" +#define ARRAY_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + attr->value_size == 0 || + attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index e093d9a2c4dd..e5d3de7cff2e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -50,6 +50,9 @@ #include #include +#define DEV_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_dtab_netdev { struct net_device *dev; struct bpf_dtab *dtab; @@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 431126f31ea3..919955236e63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,8 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" -#define HTAB_CREATE_FLAG_MASK \ - (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) struct bucket { struct hlist_nulls_head head; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be1dde967208..01aaef1a77c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -295,7 +295,7 @@ out: } static void *bpf_obj_do_get(const struct filename *pathname, - enum bpf_type *type) + enum bpf_type *type, int flags) { struct inode *inode; struct path path; @@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname, return ERR_PTR(ret); inode = d_backing_inode(path.dentry); - ret = inode_permission(inode, MAY_WRITE); + ret = inode_permission(inode, ACC_MODE(flags)); if (ret) goto out; @@ -326,18 +326,23 @@ out: return ERR_PTR(ret); } -int bpf_obj_get_user(const char __user *pathname) +int bpf_obj_get_user(const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; struct filename *pname; int ret = -ENOENT; + int f_flags; void *raw; + f_flags = bpf_get_file_flag(flags); + if (f_flags < 0) + return f_flags; + pname = getname(pathname); if (IS_ERR(pname)) return PTR_ERR(pname); - raw = bpf_obj_do_get(pname, &type); + raw = bpf_obj_do_get(pname, &type, f_flags); if (IS_ERR(raw)) { ret = PTR_ERR(raw); goto out; @@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname) if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) - ret = bpf_map_new_fd(raw); + ret = bpf_map_new_fd(raw, f_flags); else goto out; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 34d8a690ea05..885e45479680 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -495,7 +495,8 @@ out: #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) -#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ + BPF_F_RDONLY | BPF_F_WRONLY) static struct bpf_map *trie_alloc(union bpf_attr *attr) { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a298d6666698..86ec846f2d5e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -40,6 +40,9 @@ #include #include +#define SOCK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct bpf_stab { struct bpf_map map; struct sock **sock_map; @@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) + attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 135be433e9a0..a15bc636cc98 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,9 @@ #include #include "percpu_freelist.h" +#define STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; @@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags & ~BPF_F_NUMA_NODE) + if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0e893cac6795..676a06e6b322 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -34,6 +34,8 @@ #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) + DEFINE_PER_CPU(int, bpf_prog_active); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); @@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) } #endif +static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, + loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_READ. + */ + return -EINVAL; +} + +static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, + size_t siz, loff_t *ppos) +{ + /* We need this handler such that alloc_file() enables + * f_mode with FMODE_CAN_WRITE. + */ + return -EINVAL; +} + static const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif .release = bpf_map_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; -int bpf_map_new_fd(struct bpf_map *map) +int bpf_map_new_fd(struct bpf_map *map, int flags) { return anon_inode_getfd("bpf-map", &bpf_map_fops, map, - O_RDWR | O_CLOEXEC); + flags | O_CLOEXEC); +} + +int bpf_get_file_flag(int flags) +{ + if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) + return -EINVAL; + if (flags & BPF_F_RDONLY) + return O_RDONLY; + if (flags & BPF_F_WRONLY) + return O_WRONLY; + return O_RDWR; } /* helper macro to check that unused fields 'union bpf_attr' are zero */ @@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; + int f_flags; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; + f_flags = bpf_get_file_flag(attr->map_flags); + if (f_flags < 0) + return f_flags; + if (numa_node != NUMA_NO_NODE && ((unsigned int)numa_node >= nr_node_ids || !node_online(numa_node))) @@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map; - err = bpf_map_new_fd(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. * bpf_map_put() is needed because the above @@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + if (!(f.file->f_mode & FMODE_CAN_READ)) { + err = -EPERM; + goto err_put; + } + if (ukey) { key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { @@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = { .show_fdinfo = bpf_prog_show_fdinfo, #endif .release = bpf_prog_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, }; int bpf_prog_new_fd(struct bpf_prog *prog) @@ -1117,11 +1177,11 @@ free_prog_nouncharge: return err; } -#define BPF_OBJ_LAST_FIELD bpf_fd +#define BPF_OBJ_LAST_FIELD file_flags static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ)) + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) return -EINVAL; return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); @@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr) static int bpf_obj_get(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) + if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || + attr->file_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; - return bpf_obj_get_user(u64_to_user_ptr(attr->pathname)); + return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), + attr->file_flags); } #ifdef CONFIG_CGROUP_BPF @@ -1392,20 +1454,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) return fd; } -#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id +#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags static int bpf_map_get_fd_by_id(const union bpf_attr *attr) { struct bpf_map *map; u32 id = attr->map_id; + int f_flags; int fd; - if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID)) + if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || + attr->open_flags & ~BPF_OBJ_FLAG_MASK) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + f_flags = bpf_get_file_flag(attr->open_flags); + if (f_flags < 0) + return f_flags; + spin_lock_bh(&map_idr_lock); map = idr_find(&map_idr, id); if (map) @@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - fd = bpf_map_new_fd(map); + fd = bpf_map_new_fd(map, f_flags); if (fd < 0) bpf_map_put(map); -- cgit From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:24 -0700 Subject: security: bpf: Add LSM hooks for bpf object related syscall Introduce several LSM hooks for the syscalls that will allow the userspace to access to eBPF object such as eBPF programs and eBPF maps. The security check is aimed to enforce a per object security protection for eBPF object so only processes with the right priviliges can read/write to a specific map or use a specific eBPF program. Besides that, a general security hook is added before the multiplexer of bpf syscall to check the cmd and the attribute used for the command. The actual security module can decide which command need to be checked and how the cmd should be checked. Signed-off-by: Chenbo Feng Acked-by: James Morris Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 676a06e6b322..5cb56d06b48d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -212,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work) struct bpf_map *map = container_of(work, struct bpf_map, work); bpf_map_uncharge_memlock(map); + security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); } @@ -325,6 +326,12 @@ static const struct file_operations bpf_map_fops = { int bpf_map_new_fd(struct bpf_map *map, int flags) { + int ret; + + ret = security_bpf_map(map, OPEN_FMODE(flags)); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); } @@ -405,10 +412,14 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); - err = bpf_map_charge_memlock(map); + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; + err = bpf_map_charge_memlock(map); + if (err) + goto free_map_sec; + err = bpf_map_alloc_id(map); if (err) goto free_map; @@ -430,6 +441,8 @@ static int map_create(union bpf_attr *attr) free_map: bpf_map_uncharge_memlock(map); +free_map_sec: + security_bpf_map_free(map); free_map_nouncharge: map->ops->map_free(map); return err; @@ -914,6 +927,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) free_used_maps(aux); bpf_prog_uncharge_memlock(aux->prog); + security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -972,6 +986,12 @@ static const struct file_operations bpf_prog_fops = { int bpf_prog_new_fd(struct bpf_prog *prog) { + int ret; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ret; + return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); } @@ -1111,10 +1131,14 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; - err = bpf_prog_charge_memlock(prog); + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; + err = bpf_prog_charge_memlock(prog); + if (err) + goto free_prog_sec; + prog->len = attr->insn_cnt; err = -EFAULT; @@ -1172,6 +1196,8 @@ free_used_maps: free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); +free_prog_sec: + security_bpf_prog_free(prog->aux); free_prog_nouncharge: bpf_prog_free(prog); return err; @@ -1640,6 +1666,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; + err = security_bpf(cmd, &attr, size); + if (err < 0) + return err; + switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); -- cgit From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 18 Oct 2017 13:00:26 -0700 Subject: selinux: bpf: Add addtional check for bpf object file receive Introduce a bpf object related check when sending and receiving files through unix domain socket as well as binder. It checks if the receiving process have privilege to read/write the bpf map or use the bpf program. This check is necessary because the bpf maps and programs are using a anonymous inode as their shared inode so the normal way of checking the files and sockets when passing between processes cannot work properly on eBPF object. This check only works when the BPF_SYSCALL is configured. Signed-off-by: Chenbo Feng Acked-by: Stephen Smalley Reviewed-by: James Morris Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5cb56d06b48d..323be2473c4b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -315,7 +315,7 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, return -EINVAL; } -static const struct file_operations bpf_map_fops = { +const struct file_operations bpf_map_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_map_show_fdinfo, #endif @@ -975,7 +975,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) } #endif -static const struct file_operations bpf_prog_fops = { +const struct file_operations bpf_prog_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = bpf_prog_show_fdinfo, #endif -- cgit From b5149873a0c299195b5346fe4dc2c5b04ae2f995 Mon Sep 17 00:00:00 2001 From: Tal Shorer Date: Sat, 21 Oct 2017 19:29:24 +0300 Subject: workqueue: respect isolated cpus when queueing an unbound work Initialize wq_unbound_cpumask to exclude cpus that were isolated by the cmdline's isolcpus parameter. Signed-off-by: Tal Shorer Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..bfa433b38a61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4980,6 +4980,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5579,7 +5583,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit From 31749468c3f9d77927ed3144259dc208e6625ede Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 23 Oct 2017 19:39:28 +0200 Subject: bpf: cpumap fix potential lost wake-up problem As pointed out by Michael, commit 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") contains a classical example of the potential lost wake-up problem. We need to recheck the condition __ptr_ring_empty() after changing current->state to TASK_INTERRUPTIBLE, this avoids a race between wake_up_process() and schedule(). After this, a race with wake_up_process() will simply change the state to TASK_RUNNING, and the schedule() call not really put us to sleep. Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation") Reported-by: "Michael S. Tsirkin" Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b4358d84ddf1..86e29cbf7827 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -288,13 +288,17 @@ static int cpu_map_kthread_run(void *data) /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); - sched = 1; + set_current_state(TASK_INTERRUPTIBLE); + /* Recheck to avoid lost wake-up */ + if (__ptr_ring_empty(rcpu->queue)) { + schedule(); + sched = 1; + } else { + __set_current_state(TASK_RUNNING); + } } else { sched = cond_resched(); } - __set_current_state(TASK_RUNNING); /* Process packets in rcpu->queue */ local_bh_disable(); -- cgit From 0b4c6841fee03e096b735074a0c4aab3a8e92986 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:07 -0700 Subject: bpf: use the same condition in perf event set/free bpf handler This is a cleanup such that doing the same check in perf_event_free_bpf_prog as we already do in perf_event_set_bpf_prog step. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 31ee304a5844..9f78a6825bbe 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8191,10 +8191,10 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; - perf_event_free_bpf_handler(event); - - if (!event->tp_event) + if (event->attr.type != PERF_TYPE_TRACEPOINT) { + perf_event_free_bpf_handler(event); return; + } prog = event->tp_event->prog; if (prog && event->tp_event->bpf_prog_owner == event) { -- cgit From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 23 Oct 2017 23:53:08 -0700 Subject: bpf: permit multiple bpf attachments for a single perf event This patch enables multiple bpf attachments for a kprobe/uprobe/tracepoint single trace event. Each trace_event keeps a list of attached perf events. When an event happens, all attached bpf programs will be executed based on the order of attachment. A global bpf_event_mutex lock is introduced to protect prog_array attaching and detaching. An alternative will be introduce a mutex lock in every trace_event_call structure, but it takes a lot of extra memory. So a global bpf_event_mutex lock is a good compromise. The bpf prog detachment involves allocation of memory. If the allocation fails, a dummy do-nothing program will replace to-be-detached program in-place. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/core.c | 81 ++++++++++++++++++++++++++++++++++++++++++ kernel/events/core.c | 26 +++++--------- kernel/trace/bpf_trace.c | 82 ++++++++++++++++++++++++++++++++++++++++--- kernel/trace/trace_kprobe.c | 6 ++-- kernel/trace/trace_syscalls.c | 34 ++++++++++-------- kernel/trace/trace_uprobe.c | 3 +- 6 files changed, 188 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8e7c8bf2b687..7fe448799d76 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1394,6 +1394,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); +static unsigned int __bpf_prog_ret1(const void *ctx, + const struct bpf_insn *insn) +{ + return 1; +} + +static struct bpf_prog_dummy { + struct bpf_prog prog; +} dummy_bpf_prog = { + .prog = { + .bpf_func = __bpf_prog_ret1, + }, +}; + /* to avoid allocating empty bpf_prog_array for cgroups that * don't have bpf program attached use one global 'empty_prog_array' * It will not be modified the caller of bpf_prog_array_alloc() @@ -1463,6 +1477,73 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, return 0; } +void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, + struct bpf_prog *old_prog) +{ + struct bpf_prog **prog = progs->progs; + + for (; *prog; prog++) + if (*prog == old_prog) { + WRITE_ONCE(*prog, &dummy_bpf_prog.prog); + break; + } +} + +int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, + struct bpf_prog *exclude_prog, + struct bpf_prog *include_prog, + struct bpf_prog_array **new_array) +{ + int new_prog_cnt, carry_prog_cnt = 0; + struct bpf_prog **existing_prog; + struct bpf_prog_array *array; + int new_prog_idx = 0; + + /* Figure out how many existing progs we need to carry over to + * the new array. + */ + if (old_array) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) { + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + carry_prog_cnt++; + if (*existing_prog == include_prog) + return -EEXIST; + } + } + + /* How many progs (not NULL) will be in the new array? */ + new_prog_cnt = carry_prog_cnt; + if (include_prog) + new_prog_cnt += 1; + + /* Do we have any prog (not NULL) in the new array? */ + if (!new_prog_cnt) { + *new_array = NULL; + return 0; + } + + /* +1 as the end of prog_array is marked with NULL */ + array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); + if (!array) + return -ENOMEM; + + /* Fill in the new prog array */ + if (carry_prog_cnt) { + existing_prog = old_array->progs; + for (; *existing_prog; existing_prog++) + if (*existing_prog != exclude_prog && + *existing_prog != &dummy_bpf_prog.prog) + array->progs[new_prog_idx++] = *existing_prog; + } + if (include_prog) + array->progs[new_prog_idx++] = include_prog; + array->progs[new_prog_idx] = NULL; + *new_array = array; + return 0; +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9f78a6825bbe..9660ee65fbef 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7954,11 +7954,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct pt_regs *regs, struct hlist_head *head, struct task_struct *task) { - struct bpf_prog *prog = call->prog; - - if (prog) { + if (bpf_prog_array_valid(call)) { *(struct pt_regs **)raw_data = regs; - if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) { + if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; } @@ -8147,13 +8145,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint, is_syscall_tp; struct bpf_prog *prog; + int ret; if (event->attr.type != PERF_TYPE_TRACEPOINT) return perf_event_set_bpf_handler(event, prog_fd); - if (event->tp_event->prog) - return -EEXIST; - is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_syscall_tp = is_syscall_trace_event(event->tp_event); @@ -8181,26 +8177,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EACCES; } } - event->tp_event->prog = prog; - event->tp_event->bpf_prog_owner = event; - return 0; + ret = perf_event_attach_bpf_prog(event, prog); + if (ret) + bpf_prog_put(prog); + return ret; } static void perf_event_free_bpf_prog(struct perf_event *event) { - struct bpf_prog *prog; - if (event->attr.type != PERF_TYPE_TRACEPOINT) { perf_event_free_bpf_handler(event); return; } - - prog = event->tp_event->prog; - if (prog && event->tp_event->bpf_prog_owner == event) { - event->tp_event->prog = NULL; - bpf_prog_put(prog); - } + perf_event_detach_bpf_prog(event); } #else diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3126da2f468a..b65011d320e3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,7 +17,7 @@ /** * trace_call_bpf - invoke BPF program - * @prog: BPF program + * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. @@ -29,7 +29,7 @@ * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; @@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) goto out; } - rcu_read_lock(); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heurisitc to speed up execution. + * + * If bpf_prog_array_valid() fetched prog_array was + * non-NULL, we go into trace_call_bpf() and do the actual + * proper rcu_dereference() under RCU lock. + * If it turns out that prog_array is NULL then, we bail out. + * For the opposite, if the bpf_prog_array_valid() fetched pointer + * was NULL, you'll skip the prog_array with the risk of missing + * out of events when it was updated in between this and the + * rcu_dereference() which is accepted risk. + */ + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); out: __this_cpu_dec(bpf_prog_active); @@ -741,3 +754,62 @@ const struct bpf_verifier_ops perf_event_verifier_ops = { const struct bpf_prog_ops perf_event_prog_ops = { }; + +static DEFINE_MUTEX(bpf_event_mutex); + +int perf_event_attach_bpf_prog(struct perf_event *event, + struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret = -EEXIST; + + mutex_lock(&bpf_event_mutex); + + if (event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto out; + + /* set the new array to event->tp_event and set event->prog */ + event->prog = prog; + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + +out: + mutex_unlock(&bpf_event_mutex); + return ret; +} + +void perf_event_detach_bpf_prog(struct perf_event *event) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret; + + mutex_lock(&bpf_event_mutex); + + if (!event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + + ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret < 0) { + bpf_prog_array_delete_safe(old_array, event->prog); + } else { + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + } + + bpf_prog_put(event->prog); + event->prog = NULL; + +out: + mutex_unlock(&bpf_event_mutex); +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8a907e12b6b9..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,13 +1174,12 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); @@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..71a6af34d7a9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,9 +559,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) { +static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -573,7 +574,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) param.args[i] = rec->args[i]; - return trace_call_bpf(prog, ¶m); + return trace_call_bpf(call, ¶m); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) @@ -581,7 +582,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -596,9 +597,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; - prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -614,7 +615,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + if ((valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -659,8 +661,9 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } -static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_trace_exit *rec) { +static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_trace_exit *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -670,7 +673,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; - return trace_call_bpf(prog, ¶m); + return trace_call_bpf(call, ¶m); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) @@ -678,7 +681,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -693,9 +696,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); + if (!valid_prog_array && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -709,7 +712,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + if ((valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..153c0e411461 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; - struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); -- cgit From 5aaf1ab55389aeb6ce5527580a1a4d4dbc0f41ff Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Oct 2017 16:56:50 +0200 Subject: livepatch: Correctly call klp_post_unpatch_callback() in error paths The post_unpatch_enabled flag in struct klp_callbacks is set when a pre-patch callback successfully executes, indicating that we need to call a corresponding post-unpatch callback when the patch is reverted. This is true for ordinary patch disable as well as the error paths of klp_patch_object() callers. As currently coded, we inadvertently execute the post-patch callback twice in klp_module_coming() when klp_patch_object() fails: - We explicitly call klp_post_unpatch_callback() for the failed object - We call it again for the same object (and all the others) via klp_cleanup_module_patches_limited() We should clear the flag in klp_post_unpatch_callback() to make sure that the callback is not called twice. It makes the API more safe. (We could have removed the callback from the former error path as it would be covered by the latter call, but I think that is is cleaner to clear the post_unpatch_enabled after its invoked. For example, someone might later decide to call the callback only when obj->patched flag is set.) There is another mistake in the error path of klp_coming_module() in which it skips the post-unpatch callback for the klp_transition_patch. However, the pre-patch callback was called even for this patch, so be sure to make the corresponding callbacks for all patches. Finally, I used this opportunity to make klp_pre_patch_callback() more readable. [jkosina@suse.cz: incorporate changelog wording changes proposed by Joe Lawrence] Signed-off-by: Petr Mladek Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 4 +--- kernel/livepatch/core.h | 8 +++++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index cafb5a84417d..eb134479c394 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -894,9 +894,7 @@ int klp_module_coming(struct module *mod) pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); - if (patch != klp_transition_patch) - klp_post_unpatch_callback(obj); - + klp_post_unpatch_callback(obj); goto err; } diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index 6fc907b54e71..cc3aa708e0b4 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -12,10 +12,10 @@ static inline bool klp_is_object_loaded(struct klp_object *obj) static inline int klp_pre_patch_callback(struct klp_object *obj) { - int ret; + int ret = 0; - ret = (obj->callbacks.pre_patch) ? - (*obj->callbacks.pre_patch)(obj) : 0; + if (obj->callbacks.pre_patch) + ret = (*obj->callbacks.pre_patch)(obj); obj->callbacks.post_unpatch_enabled = !ret; @@ -39,6 +39,8 @@ static inline void klp_post_unpatch_callback(struct klp_object *obj) if (obj->callbacks.post_unpatch_enabled && obj->callbacks.post_unpatch) (*obj->callbacks.post_unpatch)(obj); + + obj->callbacks.post_unpatch_enabled = false; } #endif /* _LIVEPATCH_CORE_H */ -- cgit From 89a9a1c1c89cea5f70975c338c011b9f7024dee5 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 20 Oct 2017 16:56:51 +0200 Subject: livepatch: __klp_disable_patch() should never be called for disabled patches __klp_disable_patch() should never be called when the patch is not enabled. Let's add the same warning that we have in __klp_enable_patch(). This allows to remove the check when calling klp_pre_unpatch_callback(). It was strange anyway because it repeatedly checked per-patch flag for each patched object. Signed-off-by: Petr Mladek Acked-by: Joe Lawrence Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb134479c394..287f71e9dbfe 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -282,6 +282,9 @@ static int __klp_disable_patch(struct klp_patch *patch) { struct klp_object *obj; + if (WARN_ON(!patch->enabled)) + return -EINVAL; + if (klp_transition_patch) return -EBUSY; @@ -293,7 +296,7 @@ static int __klp_disable_patch(struct klp_patch *patch) klp_init_transition(patch, KLP_UNPATCHED); klp_for_each_object(patch, obj) - if (patch->enabled && obj->patched) + if (obj->patched) klp_pre_unpatch_callback(obj); /* -- cgit From d41bf8c9deaed1a90b18d3ffc5639d4c19f0259a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Oct 2017 16:18:27 -0700 Subject: cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat The basic cpu stat is currently shown with "cpu." prefix in cgroup.stat, and the same information is duplicated in cpu.stat when cpu controller is enabled. This is ugly and not very scalable as we want to expand the coverage of stat information which is always available. This patch makes cgroup core always create "cpu.stat" file and show the basic cpu stat there and calls the cpu controller to show the extra stats when enabled. This ensures that the same information isn't presented in multiple places and makes future expansion of basic stats easier. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra (Intel) --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup.c | 60 +++++++++++++++++++++++++++++++++++++++-- kernel/cgroup/stat.c | 10 +++---- kernel/sched/core.c | 13 +++------ 4 files changed, 68 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index fa642c99586a..4dc317090920 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -205,6 +205,7 @@ int cgroup_task_count(const struct cgroup *cgrp); void cgroup_stat_flush(struct cgroup *cgrp); int cgroup_stat_init(struct cgroup *cgrp); void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_stat_show_cputime(struct seq_file *seq); void cgroup_stat_boot(void); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7975b20f1fd1..d9773e49a1b4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -463,6 +463,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->self; } +/** + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist + * or is offline, %NULL is returned. + */ +static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + css = cgroup_css(cgrp, ss); + if (!css || !css_tryget_online(css)) + css = NULL; + rcu_read_unlock(); + + return css; +} + /** * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem * @cgrp: the cgroup of interest @@ -3311,11 +3333,40 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) seq_printf(seq, "nr_dying_descendants %d\n", cgroup->nr_dying_descendants); - cgroup_stat_show_cputime(seq, "cpu."); - return 0; } +static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, + struct cgroup *cgrp, int ssid) +{ + struct cgroup_subsys *ss = cgroup_subsys[ssid]; + struct cgroup_subsys_state *css; + int ret; + + if (!ss->css_extra_stat_show) + return 0; + + css = cgroup_tryget_css(cgrp, ss); + if (!css) + return 0; + + ret = ss->css_extra_stat_show(seq, css); + css_put(css); + return ret; +} + +static int cpu_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + int ret = 0; + + cgroup_stat_show_cputime(seq); +#ifdef CONFIG_CGROUP_SCHED + ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); +#endif + return ret; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4423,6 +4474,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cpu.stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_stat_show, + }, { } /* terminate */ }; diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 9cce79e89320..133b465691d6 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -256,7 +256,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, cgroup_cpu_stat_account_end(cgrp, cstat); } -void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) +void cgroup_stat_show_cputime(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; @@ -278,10 +278,10 @@ void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix) do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); - seq_printf(seq, "%susage_usec %llu\n" - "%suser_usec %llu\n" - "%ssystem_usec %llu\n", - prefix, usage, prefix, utime, prefix, stime); + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); } int cgroup_stat_init(struct cgroup *cgrp) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad255162a830..0b3eec389552 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6678,13 +6678,12 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; -static int cpu_stat_show(struct seq_file *sf, void *v) +static int cpu_extra_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) { - cgroup_stat_show_cputime(sf, ""); - #ifdef CONFIG_CFS_BANDWIDTH { - struct task_group *tg = css_tg(seq_css(sf)); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; u64 throttled_usec; @@ -6817,11 +6816,6 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, #endif static struct cftype cpu_files[] = { - { - .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_stat_show, - }, #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "weight", @@ -6852,6 +6846,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, -- cgit From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Thu, 26 Oct 2017 01:47:42 +0000 Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related syscall") included linux/bpf.h in linux/security.h. As a result, bpf programs including bpf_helpers.h and some other header that ends up pulling in also security.h, such as several examples under samples/bpf, fail to compile because bpf_tail_call and bpf_get_stackid are now "redefined as different kind of symbol". >From bpf.h: u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); Whereas in bpf_helpers.h they are: static void (*bpf_tail_call)(void *ctx, void *map, int index); static int (*bpf_get_stackid)(void *ctx, void *map, int flags); Fix this by removing the unused declaration of bpf_tail_call and moving the declaration of bpf_get_stackid in bpf_trace.c, which is the only place where it's needed. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b65011d320e3..136aa6bb0422 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -15,6 +15,8 @@ #include #include "trace.h" +u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event -- cgit From bc8293663b953c23ff7b73eb15f82393425e5e47 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Sun, 22 Oct 2017 22:30:55 +0800 Subject: printk: fix typo in printk_safe.c Link: http://lkml.kernel.org/r/1508682655-27293-1-git-send-email-bhe@redhat.com Cc: rostedt@goodmis.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Baoquan He Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk_safe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 3cdaeaef9ce1..89558b85f45c 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -75,7 +75,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s) * have dedicated buffers, because otherwise printk-safe preempted by * NMI-printk would have overwritten the NMI messages. * - * The messages are fushed from irq work (or from panic()), possibly, + * The messages are flushed from irq work (or from panic()), possibly, * from other CPU, concurrently with printk_safe_log_store(). Should this * happen, printk_safe_log_store() will notice the buffer->len mismatch * and repeat the write. -- cgit From c3ba13298709f46e72b22d087d0aa02bd012e4b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Oct 2017 08:13:14 -0700 Subject: cgroup: mark @cgrp __maybe_unused in cpu_stat_show() The local variable @cgrp isn't used if !CONFIG_CGROUP_SCHED. Mark the variable with __maybe_unused to avoid a compile warning. Reported-by: "kbuild-all@01.org" Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9773e49a1b4..d6ed725f36d9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3357,7 +3357,7 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, static int cpu_stat_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = seq_css(seq)->cgroup; + struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; cgroup_stat_show_cputime(seq); -- cgit From 9afe77ed849de6af8532b4c1b9310102eed9edf7 Mon Sep 17 00:00:00 2001 From: Maxim Akristiniy Date: Mon, 23 Oct 2017 19:51:48 +0300 Subject: added new line symbol after warning about dropped messages so this message will not mess with the next one Cc: Steven Rostedt Signed-off-by: Maxim Akristiniy Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 512f7c2baedd..5d81206a572d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2190,7 +2190,7 @@ again: } if (console_seq < log_first_seq) { - len = sprintf(text, "** %u printk messages dropped ** ", + len = sprintf(text, "** %u printk messages dropped **\n", (unsigned)(log_first_seq - console_seq)); /* messages are gone, move to first one */ -- cgit From ab97f87325e28b7ef7717e6cb62e8da14a7176e1 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 20 Oct 2017 13:26:02 +0300 Subject: fsnotify: convert fsnotify_mark.refcnt from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable fsnotify_mark.refcnt is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Signed-off-by: Elena Reshetova Signed-off-by: Jan Kara --- kernel/audit_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 011d46e5f73f..45ec960ad536 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1007,7 +1007,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify * We are guaranteed to have at least one reference to the mark from * either the inode or the caller of fsnotify_destroy_mark(). */ - BUG_ON(atomic_read(&entry->refcnt) < 1); + BUG_ON(refcount_read(&entry->refcnt) < 1); } static const struct fsnotify_ops audit_tree_ops = { -- cgit From aa4bf44dc851c6bdd4f7b61b5f2c56c84dfe2ff0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:40 +0200 Subject: userns: use union in {g,u}idmap struct - Add a struct containing two pointer to extents and wrap both the static extent array and the struct into a union. This is done in preparation for bumping the {g,u}idmap limits for user namespaces. - Add brackets around anonymous union when using designated initializers to initialize members in order to please gcc <= 4.4. Signed-off-by: Christian Brauner Signed-off-by: Eric W. Biederman --- kernel/user.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 00281add65b2..9a20acce460d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -26,26 +26,32 @@ struct user_namespace init_user_ns = { .uid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .gid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .projid_map = { .nr_extents = 1, - .extent[0] = { - .first = 0, - .lower_first = 0, - .count = 4294967295U, + { + .extent[0] = { + .first = 0, + .lower_first = 0, + .count = 4294967295U, + }, }, }, .count = ATOMIC_INIT(3), -- cgit From 6397fac4915ab3002dc15aae751455da1a852f25 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Oct 2017 00:04:41 +0200 Subject: userns: bump idmap limits to 340 There are quite some use cases where users run into the current limit for {g,u}id mappings. Consider a user requesting us to map everything but 999, and 1001 for a given range of 1000000000 with a sub{g,u}id layout of: some-user:100000:1000000000 some-user:999:1 some-user:1000:1 some-user:1001:1 some-user:1002:1 This translates to: MAPPING-TYPE | CONTAINER | HOST | RANGE | -------------|-----------|---------|-----------| uid | 999 | 999 | 1 | uid | 1001 | 1001 | 1 | uid | 0 | 1000000 | 999 | uid | 1000 | 1001000 | 1 | uid | 1002 | 1001002 | 999998998 | ------------------------------------------------ gid | 999 | 999 | 1 | gid | 1001 | 1001 | 1 | gid | 0 | 1000000 | 999 | gid | 1000 | 1001000 | 1 | gid | 1002 | 1001002 | 999998998 | which is already the current limit. As discussed at LPC simply bumping the number of limits is not going to work since this would mean that struct uid_gid_map won't fit into a single cache-line anymore thereby regressing performance for the base-cases. The same problem seems to arise when using a single pointer. So the idea is to use struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; For the base cases we will only use the struct uid_gid_extent extent member. If we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single 4k kmalloc() which means we can have a maximum of 340 mappings (340 * size(struct uid_gid_extent) = 4080). For the latter case we use two pointers "forward" and "reverse". The forward pointer points to an array sorted by "first" and the reverse pointer points to an array sorted by "lower_first". We can then perform binary search on those arrays. Performance Testing: When Eric introduced the extent-based struct uid_gid_map approach he measured the performanc impact of his idmap changes: > My benchmark consisted of going to single user mode where nothing else was > running. On an ext4 filesystem opening 1,000,000 files and looping through all > of the files 1000 times and calling fstat on the individuals files. This was > to ensure I was benchmarking stat times where the inodes were in the kernels > cache, but the inode values were not in the processors cache. My results: > v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled) > v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled) > v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled) I used an identical approach on my laptop. Here's a thorough description of what I did. I built a 4.14.0-rc4 mainline kernel with my new idmap patches applied. I booted into single user mode and used an ext4 filesystem to open/create 1,000,000 files. Then I looped through all of the files calling fstat() on each of them 1000 times and calculated the mean fstat() time for a single file. (The test program can be found below.) Here are the results. For fun, I compared the first version of my patch which scaled linearly with the new version of the patch: | # MAPPINGS | PATCH-V1 | PATCH-NEW | |--------------|------------|-----------| | 0 mappings | 158 ns | 158 ns | | 1 mappings | 164 ns | 157 ns | | 2 mappings | 170 ns | 158 ns | | 3 mappings | 175 ns | 161 ns | | 5 mappings | 187 ns | 165 ns | | 10 mappings | 218 ns | 199 ns | | 50 mappings | 528 ns | 218 ns | | 100 mappings | 980 ns | 229 ns | | 200 mappings | 1880 ns | 239 ns | | 300 mappings | 2760 ns | 240 ns | | 340 mappings | not tested | 248 ns | Here's the test program I used. I asked Eric what he did and this is a more "advanced" implementation of the idea. It's pretty straight-forward: #define __GNU_SOURCE #define __STDC_FORMAT_MACROS #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char *argv[]) { int ret; size_t i, k; int fd[1000000]; int times[1000]; char pathname[4096]; struct stat st; struct timeval t1, t2; uint64_t time_in_mcs; uint64_t sum = 0; if (argc != 2) { fprintf(stderr, "Please specify a directory where to create " "the test files\n"); exit(EXIT_FAILURE); } for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { sprintf(pathname, "%s/idmap_test_%zu", argv[1], i); fd[i]= open(pathname, O_RDWR | O_CREAT, S_IXUSR | S_IXGRP | S_IXOTH); if (fd[i] < 0) { ssize_t j; for (j = i; j >= 0; j--) close(fd[j]); exit(EXIT_FAILURE); } } for (k = 0; k < 1000; k++) { ret = gettimeofday(&t1, NULL); if (ret < 0) goto close_all; for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) { ret = fstat(fd[i], &st); if (ret < 0) goto close_all; } ret = gettimeofday(&t2, NULL); if (ret < 0) goto close_all; time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) - (1000000 * t1.tv_sec + t1.tv_usec); printf("Total time in micro seconds: %" PRIu64 "\n", time_in_mcs); printf("Total time in nanoseconds: %" PRIu64 "\n", time_in_mcs * 1000); printf("Time per file in nanoseconds: %" PRIu64 "\n", (time_in_mcs * 1000) / 1000000); times[k] = (time_in_mcs * 1000) / 1000000; } close_all: for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) close(fd[i]); if (ret < 0) exit(EXIT_FAILURE); for (k = 0; k < 1000; k++) { sum += times[k]; } printf("Mean time per file in nanoseconds: %" PRIu64 "\n", sum / 1000); exit(EXIT_SUCCESS);; } Signed-off-by: Christian Brauner CC: Serge Hallyn CC: Eric Biederman Signed-off-by: Eric W. Biederman --- kernel/user_namespace.c | 350 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 320 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..5fd2d53dbc75 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); @@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work) do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; + if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->gid_map.forward); + kfree(ns->gid_map.reverse); + } + if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->uid_map.forward); + kfree(ns->uid_map.reverse); + } + if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(ns->projid_map.forward); + kfree(ns->projid_map.reverse); + } retire_userns_sysctls(ns); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); @@ -198,7 +212,84 @@ void __put_user_ns(struct user_namespace *ns) } EXPORT_SYMBOL(__put_user_ns); -static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +/** + * idmap_key struct holds the information necessary to find an idmapping in a + * sorted idmap array. It is passed to cmp_map_id() as first argument. + */ +struct idmap_key { + bool map_up; /* true -> id from kid; false -> kid from id */ + u32 id; /* id to find */ + u32 count; /* == 0 unless used with map_id_range_down() */ +}; + +/** + * cmp_map_id - Function to be passed to bsearch() to find the requested + * idmapping. Expects struct idmap_key to be passed via @k. + */ +static int cmp_map_id(const void *k, const void *e) +{ + u32 first, last, id2; + const struct idmap_key *key = k; + const struct uid_gid_extent *el = e; + + /* handle map_id_range_down() */ + if (key->count) + id2 = key->id + key->count - 1; + else + id2 = key->id; + + /* handle map_id_{down,up}() */ + if (key->map_up) + first = el->lower_first; + else + first = el->first; + + last = first + el->count - 1; + + if (key->id >= first && key->id <= last && + (id2 >= first && id2 <= last)) + return 0; + + if (key->id < first || id2 < first) + return -1; + + return 1; +} + +/** + * map_id_range_down_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = false; + key.count = count; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; +} + +/** + * map_id_range_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) { unsigned idx, extents; u32 first, last, id2; @@ -224,7 +315,23 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -static u32 map_id_down(struct uid_gid_map *map, u32 id) +static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_range_down_base(map, id, count); + + return map_id_range_down_max(map, id, count); +} + +/** + * map_id_down_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_down_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -247,7 +354,23 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) +static u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_down_base(map, id); + + return map_id_range_down_max(map, id, 0); +} + +/** + * map_id_up_base - Find idmap via binary search in static extent array. + * Can only be called if number of mappings is equal or less than + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_base(struct uid_gid_map *map, u32 id) { unsigned idx, extents; u32 first, last; @@ -270,6 +393,45 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) return id; } +/** + * map_id_up_max - Find idmap via binary search in ordered idmap array. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +{ + u32 extents; + struct uid_gid_extent *extent; + struct idmap_key key; + + key.map_up = true; + key.count = 0; + key.id = id; + + extents = map->nr_extents; + smp_rmb(); + + extent = bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); + /* Map the id or note failure */ + if (extent) + id = (id - extent->lower_first) + extent->first; + else + id = (u32) -1; + + return id; +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + u32 extents = map->nr_extents; + smp_rmb(); + + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return map_id_up_base(map, id); + + return map_id_up_max(map, id); +} + /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in @@ -540,13 +702,15 @@ static int projid_m_show(struct seq_file *seq, void *v) static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { - struct uid_gid_extent *extent = NULL; loff_t pos = *ppos; - if (pos < map->nr_extents) - extent = &map->extent[pos]; + if (pos >= map->nr_extents) + return NULL; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return &map->extent[pos]; - return extent; + return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) @@ -618,7 +782,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map, u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; - prev = &new_map->extent[idx]; + if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + prev = &new_map->extent[idx]; + else + prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; @@ -638,6 +805,102 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } +/** + * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. + * Takes care to allocate a 4K block of memory if the number of mappings exceeds + * UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) +{ + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { + map->extent[map->nr_extents].first = extent->first; + map->extent[map->nr_extents].lower_first = extent->lower_first; + map->extent[map->nr_extents].count = extent->count; + return 0; + } + + if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { + struct uid_gid_extent *forward; + + /* Allocate memory for 340 mappings. */ + forward = kmalloc(sizeof(struct uid_gid_extent) * + UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + if (!forward) + return -ENOMEM; + + /* Copy over memory. Only set up memory for the forward pointer. + * Defer the memory setup for the reverse pointer. + */ + memcpy(forward, map->extent, + map->nr_extents * sizeof(map->extent[0])); + + map->forward = forward; + map->reverse = NULL; + } + + map->forward[map->nr_extents].first = extent->first; + map->forward[map->nr_extents].lower_first = extent->lower_first; + map->forward[map->nr_extents].count = extent->count; + return 0; +} + +/* cmp function to sort() forward mappings */ +static int cmp_extents_forward(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->first < e2->first) + return -1; + + if (e1->first > e2->first) + return 1; + + return 0; +} + +/* cmp function to sort() reverse mappings */ +static int cmp_extents_reverse(const void *a, const void *b) +{ + const struct uid_gid_extent *e1 = a; + const struct uid_gid_extent *e2 = b; + + if (e1->lower_first < e2->lower_first) + return -1; + + if (e1->lower_first > e2->lower_first) + return 1; + + return 0; +} + +/** + * sort_idmaps - Sorts an array of idmap entries. + * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. + */ +static int sort_idmaps(struct uid_gid_map *map) +{ + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + return 0; + + /* Sort forward array. */ + sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_forward, NULL); + + /* Only copy the memory from forward we actually need. */ + map->reverse = kmemdup(map->forward, + map->nr_extents * sizeof(struct uid_gid_extent), + GFP_KERNEL); + if (!map->reverse) + return -ENOMEM; + + /* Sort reverse array. */ + sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), + cmp_extents_reverse, NULL); + + return 0; +} + static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -648,7 +911,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct user_namespace *ns = seq->private; struct uid_gid_map new_map; unsigned idx; - struct uid_gid_extent *extent = NULL; + struct uid_gid_extent extent; char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; @@ -673,6 +936,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, */ mutex_lock(&userns_state_mutex); + memset(&new_map, 0, sizeof(struct uid_gid_map)); + ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) @@ -700,9 +965,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, /* Parse the user data */ ret = -EINVAL; pos = kbuf; - new_map.nr_extents = 0; for (; pos; pos = next_line) { - extent = &new_map.extent[new_map.nr_extents]; /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); @@ -714,17 +977,17 @@ static ssize_t map_write(struct file *file, const char __user *buf, } pos = skip_spaces(pos); - extent->first = simple_strtoul(pos, &pos, 10); + extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->lower_first = simple_strtoul(pos, &pos, 10); + extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); - extent->count = simple_strtoul(pos, &pos, 10); + extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; @@ -734,29 +997,33 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; /* Verify we have been given valid starting values */ - if ((extent->first == (u32) -1) || - (extent->lower_first == (u32) -1)) + if ((extent.first == (u32) -1) || + (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ - if ((extent->first + extent->count) <= extent->first) + if ((extent.first + extent.count) <= extent.first) goto out; - if ((extent->lower_first + extent->count) <= - extent->lower_first) + if ((extent.lower_first + extent.count) <= + extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ - if (mappings_overlap(&new_map, extent)) + if (mappings_overlap(&new_map, &extent)) goto out; - new_map.nr_extents++; - - /* Fail if the file contains too many extents */ - if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && + if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; + + ret = insert_extent(&new_map, &extent); + if (ret < 0) + goto out; + ret = -EINVAL; + + new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) @@ -767,16 +1034,26 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; + ret = sort_idmaps(&new_map); + if (ret < 0) + goto out; + + ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { + struct uid_gid_extent *e; u32 lower_first; - extent = &new_map.extent[idx]; + + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + e = &new_map.extent[idx]; + else + e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, - extent->lower_first, - extent->count); + e->lower_first, + e->count); /* Fail if we can not map the specified extent to * the kernel global id space. @@ -784,18 +1061,31 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (lower_first == (u32) -1) goto out; - extent->lower_first = lower_first; + e->lower_first = lower_first; } /* Install the map */ - memcpy(map->extent, new_map.extent, - new_map.nr_extents*sizeof(new_map.extent[0])); + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + memcpy(map->extent, new_map.extent, + new_map.nr_extents * sizeof(new_map.extent[0])); + } else { + map->forward = new_map.forward; + map->reverse = new_map.reverse; + } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: + if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { + kfree(new_map.forward); + kfree(new_map.reverse); + map->forward = NULL; + map->reverse = NULL; + map->nr_extents = 0; + } + mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; -- cgit From 11a8b9270e16e36d5fb607ba4b60db2958b7c625 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 15:54:32 -0500 Subject: userns: Don't special case a count of 0 We can always use a count of 1 so there is no reason to have a special case of a count of 0. Signed-off-by: Eric W. Biederman --- kernel/user_namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5fd2d53dbc75..c9904ee084c4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -232,11 +232,7 @@ static int cmp_map_id(const void *k, const void *e) const struct idmap_key *key = k; const struct uid_gid_extent *el = e; - /* handle map_id_range_down() */ - if (key->count) - id2 = key->id + key->count - 1; - else - id2 = key->id; + id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) @@ -362,7 +358,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return map_id_down_base(map, id); - return map_id_range_down_max(map, id, 0); + return map_id_range_down_max(map, id, 1); } /** @@ -404,7 +400,7 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id) struct idmap_key key; key.map_up = true; - key.count = 0; + key.count = 1; key.id = id; extents = map->nr_extents; -- cgit From 3edf652fa16562fb57a5a4b996ba72e2d7cdc38b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 16:27:29 -0500 Subject: userns: Simplify the user and group mapping functions Consolidate reading the number of extents and computing the return value in the map_id_down, map_id_range_down and map_id_range. This removal of one read of extents makes one smp_rmb unnecessary and makes the code safe it is executed during the map write. Reading the number of extents twice and depending on the result being the same is not safe, as it could be 0 the first time and > 5 the second time, which would lead to misinterpreting the union fields. The consolidation of the return value just removes a duplicate caluculation which should make it easier to understand and maintain the code. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 132 +++++++++++++++++++++--------------------------- 1 file changed, 58 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c9904ee084c4..563a2981d7c7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -256,28 +256,17 @@ static int cmp_map_id(const void *k, const void *e) * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) +static struct uid_gid_extent * +map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { - u32 extents; - struct uid_gid_extent *extent; struct idmap_key key; key.map_up = false; key.count = count; key.id = id; - extents = map->nr_extents; - smp_rmb(); - - extent = bsearch(&key, map->forward, extents, - sizeof(struct uid_gid_extent), cmp_map_id); - /* Map the id or note failure */ - if (extent) - id = (id - extent->first) + extent->lower_first; - else - id = (u32) -1; - - return id; + return bsearch(&key, map->forward, extents, + sizeof(struct uid_gid_extent), cmp_map_id); } /** @@ -285,41 +274,43 @@ static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count) * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count) +static struct uid_gid_extent * +map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { - unsigned idx, extents; + unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; - else - id = (u32) -1; - - return id; + return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { - u32 extents = map->nr_extents; + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_range_down_base(map, id, count); + extent = map_id_range_down_base(extents, map, id, count); + else + extent = map_id_range_down_max(extents, map, id, count); - return map_id_range_down_max(map, id, count); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; } /** @@ -327,38 +318,40 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_down_base(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id) { - unsigned idx, extents; + unsigned idx; u32 first, last; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].lower_first; - else - id = (u32) -1; - - return id; + return NULL; } static u32 map_id_down(struct uid_gid_map *map, u32 id) { - u32 extents = map->nr_extents; + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_down_base(map, id); + extent = map_id_down_base(extents, map, id); + else + extent = map_id_range_down_max(extents, map, id, 1); - return map_id_range_down_max(map, id, 1); + /* Map the id or note failure */ + if (extent) + id = (id - extent->first) + extent->lower_first; + else + id = (u32) -1; + + return id; } /** @@ -366,48 +359,50 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_up_base(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { - unsigned idx, extents; + unsigned idx; u32 first, last; /* Find the matching extent */ - extents = map->nr_extents; - smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) - break; + return &map->extent[idx]; } - /* Map the id or note failure */ - if (idx < extents) - id = (id - first) + map->extent[idx].first; - else - id = (u32) -1; - - return id; + return NULL; } /** * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ -static u32 map_id_up_max(struct uid_gid_map *map, u32 id) +static struct uid_gid_extent * +map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { - u32 extents; - struct uid_gid_extent *extent; struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; - extents = map->nr_extents; + return bsearch(&key, map->reverse, extents, + sizeof(struct uid_gid_extent), cmp_map_id); +} + +static u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + struct uid_gid_extent *extent; + unsigned extents = map->nr_extents; smp_rmb(); - extent = bsearch(&key, map->reverse, extents, - sizeof(struct uid_gid_extent), cmp_map_id); + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = map_id_up_base(extents, map, id); + else + extent = map_id_up_max(extents, map, id); + /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; @@ -417,17 +412,6 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id) return id; } -static u32 map_id_up(struct uid_gid_map *map, u32 id) -{ - u32 extents = map->nr_extents; - smp_rmb(); - - if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - return map_id_up_base(map, id); - - return map_id_up_max(map, id); -} - /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in -- cgit From d5e7b3c5f51fc6d34e12b6d87bfd30ab277c4625 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 17:09:34 -0500 Subject: userns: Don't read extents twice in m_start This is important so reading /proc//{uid_map,gid_map,projid_map} while the map is being written does not do strange things. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 563a2981d7c7..4f7e357ac1e2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; + unsigned extents = map->nr_extents; + smp_rmb(); - if (pos >= map->nr_extents) + if (pos >= extents) return NULL; - if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; -- cgit From ece66133979b211324cc6aff9285889b425243d2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 16:53:09 -0500 Subject: userns: Make map_id_down a wrapper for map_id_range_down There is no good reason for this code duplication, the number of cache line accesses not the number of instructions are the bottleneck in this code. Therefore simplify maintenance by removing unnecessary code. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4f7e357ac1e2..1d0298870ee3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -313,45 +313,9 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) return id; } -/** - * map_id_down_base - Find idmap via binary search in static extent array. - * Can only be called if number of mappings is equal or less than - * UID_GID_MAP_MAX_BASE_EXTENTS. - */ -static struct uid_gid_extent * -map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id) -{ - unsigned idx; - u32 first, last; - - /* Find the matching extent */ - for (idx = 0; idx < extents; idx++) { - first = map->extent[idx].first; - last = first + map->extent[idx].count - 1; - if (id >= first && id <= last) - return &map->extent[idx]; - } - return NULL; -} - static u32 map_id_down(struct uid_gid_map *map, u32 id) { - struct uid_gid_extent *extent; - unsigned extents = map->nr_extents; - smp_rmb(); - - if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) - extent = map_id_down_base(extents, map, id); - else - extent = map_id_range_down_max(extents, map, id, 1); - - /* Map the id or note failure */ - if (extent) - id = (id - extent->first) + extent->lower_first; - else - id = (u32) -1; - - return id; + return map_id_range_down(map, id, 1); } /** -- cgit From 3fda0e737e906ce73220b20c27e7f792d0aac6a8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 31 Oct 2017 17:15:30 -0500 Subject: userns: Simplify insert_extent Consolidate the code to write to the new mapping at the end of the function to remove the duplication. Move the increase in the number of mappings into insert_extent, keeping the logic together. Just a small increase in readability and maintainability. Signed-off-by: "Eric W. Biederman" --- kernel/user_namespace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d0298870ee3..899c31060ff3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -758,12 +758,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map, */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { - if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) { - map->extent[map->nr_extents].first = extent->first; - map->extent[map->nr_extents].lower_first = extent->lower_first; - map->extent[map->nr_extents].count = extent->count; - return 0; - } + struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; @@ -784,9 +779,13 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) map->reverse = NULL; } - map->forward[map->nr_extents].first = extent->first; - map->forward[map->nr_extents].lower_first = extent->lower_first; - map->forward[map->nr_extents].count = extent->count; + if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) + dest = &map->extent[map->nr_extents]; + else + dest = &map->forward[map->nr_extents]; + + *dest = *extent; + map->nr_extents++; return 0; } @@ -968,8 +967,6 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (ret < 0) goto out; ret = -EINVAL; - - new_map.nr_extents++; } /* Be very certaint the new map actually exists */ if (new_map.nr_extents == 0) -- cgit From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 31 Oct 2017 18:16:05 -0700 Subject: bpf: reduce verifier memory consumption the verifier got progressively smarter over time and size of its internal state grew as well. Time to reduce the memory consumption. Before: sizeof(struct bpf_verifier_state) = 6520 After: sizeof(struct bpf_verifier_state) = 896 It's done by observing that majority of BPF programs use little to no stack whereas verifier kept all of 512 stack slots ready always. Instead dynamically reallocate struct verifier state when stack access is detected. Runtime difference before vs after is within a noise. The number of processed instructions stays the same. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 437 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 288 insertions(+), 149 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d906775e12c1..5f26f7ad124f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -276,43 +276,132 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ")"); } } - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] == STACK_SPILL) - verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] == STACK_SPILL) + verbose(env, " fp%d=%s", + -MAX_BPF_STACK + i * BPF_REG_SIZE, + reg_type_str[state->stack[i].spilled_ptr.type]); } verbose(env, "\n"); } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { - struct bpf_verifier_stack_elem *elem; - int insn_idx; + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); + return 0; +} + +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) +{ + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; + + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; + } + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; + kfree(state->stack); + state->stack = new_stack; + return 0; +} + +static void free_verifier_state(struct bpf_verifier_state *state) +{ + kfree(state->stack); + kfree(state); +} + +/* copy verifier state from src to dst growing dst stack space + * when necessary to accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + int err; + + err = realloc_verifier_state(dst, src->allocated_stack, false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + return copy_stack_state(dst, src); +} + +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, + int *insn_idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem, *head = env->head; + int err; if (env->head == NULL) - return -1; + return -ENOENT; - memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); - insn_idx = env->head->insn_idx; + if (cur) { + err = copy_verifier_state(cur, &head->st); + if (err) + return err; + } + if (insn_idx) + *insn_idx = head->insn_idx; if (prev_insn_idx) - *prev_insn_idx = env->head->prev_insn_idx; - elem = env->head->next; - kfree(env->head); + *prev_insn_idx = head->prev_insn_idx; + elem = head->next; + kfree(head); env->head = elem; env->stack_size--; - return insn_idx; + return 0; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { + struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; + int err; - elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; - memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + err = copy_verifier_state(&elem->st, cur); + if (err) + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; @@ -325,7 +414,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; err: /* pop all elements and return */ - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); return NULL; } @@ -550,7 +639,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -563,7 +652,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(&env->cur_state, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -601,10 +690,21 @@ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); + if (err) + return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ + if (!env->allow_ptr_leaks && + state->stack[spi].slot_type[0] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose(env, "attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } if (value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type)) { @@ -616,17 +716,18 @@ static int check_stack_write(struct bpf_verifier_env *env, } /* save register state */ - state->spilled_regs[spi] = state->regs[value_regno]; - state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + state->stack[spi].slot_type[i] = STACK_SPILL; } else { /* regular write of data into stack */ - state->spilled_regs[spi] = (struct bpf_reg_state) {}; + state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = + STACK_MISC; } return 0; } @@ -637,10 +738,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->spilled_regs[slot].live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; } @@ -650,34 +751,37 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno) { - u8 *slot_type; - int i, spi; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + u8 *stype; - slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + if (state->allocated_stack <= slot) { + verbose(env, "invalid read from stack off %d+0 size %d\n", + off, size); + return -EACCES; + } + stype = state->stack[spi].slot_type; - if (slot_type[0] == STACK_SPILL) { + if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { - if (slot_type[i] != STACK_SPILL) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { verbose(env, "corrupted spill memory\n"); return -EACCES; } } - spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; - if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->spilled_regs[spi]; + state->regs[value_regno] = state->stack[spi].spilled_ptr; mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { - if (slot_type[i] != STACK_MISC) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -694,7 +798,8 @@ static int check_stack_read(struct bpf_verifier_env *env, static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", @@ -706,9 +811,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -783,7 +888,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { @@ -797,7 +902,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; int err; @@ -866,7 +971,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); + return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, @@ -968,8 +1073,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = &env->cur_state; - struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -993,7 +1099,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -1028,14 +1134,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); else - mark_reg_known_zero(env, state->regs, + mark_reg_known_zero(env, regs, value_regno); - state->regs[value_regno].id = 0; - state->regs[value_regno].off = 0; - state->regs[value_regno].range = 0; - state->regs[value_regno].type = reg_type; + regs[value_regno].id = 0; + regs[value_regno].off = 0; + regs[value_regno].range = 0; + regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { @@ -1061,19 +1167,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (env->prog->aux->stack_depth < -off) env->prog->aux->stack_depth = -off; - if (t == BPF_WRITE) { - if (!env->allow_ptr_leaks && - state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && - size != BPF_REG_SIZE) { - verbose(env, "attempt to corrupt spilled pointer on stack\n"); - return -EACCES; - } + if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, value_regno); - } else { + else err = check_stack_read(env, state, off, size, value_regno); - } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -1087,7 +1186,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(env, state->regs, value_regno); + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1095,11 +1194,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && - state->regs[value_regno].type == SCALAR_VALUE) { + regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - state->regs[value_regno].var_off = tnum_cast( - state->regs[value_regno].var_off, size); - __update_reg_bounds(&state->regs[value_regno]); + regs[value_regno].var_off = + tnum_cast(regs[value_regno].var_off, size); + __update_reg_bounds(®s[value_regno]); } return err; } @@ -1156,9 +1255,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs; - int off, i; + int off, i, slot, spi; if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1198,7 +1297,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + slot = -(off + i) - 1; + spi = slot / BPF_REG_SIZE; + if (state->allocated_stack <= slot || + state->stack[spi].slot_type[slot % BPF_REG_SIZE] != + STACK_MISC) { verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1211,7 +1314,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; switch (reg->type) { case PTR_TO_PACKET: @@ -1229,7 +1332,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1514,7 +1617,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn) */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1522,10 +1625,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) if (reg_is_pkt_pointer_any(®s[i])) mark_reg_unknown(env, regs, i); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg_is_pkt_pointer_any(reg)) __mark_reg_unknown(reg); } @@ -1533,9 +1636,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { - struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1603,6 +1705,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; } + regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -1691,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1703,13 +1806,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: known but bad ubounds\n"); return -EINVAL; @@ -1890,7 +1993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; s64 smin_val, smax_val; @@ -2111,7 +2214,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2185,12 +2288,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, env->cur_state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2200,7 +2303,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; @@ -2421,10 +2524,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) reg->range = max_t(u16, reg->range, new_range); } @@ -2674,17 +2777,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -2876,7 +2979,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -2937,7 +3040,7 @@ static bool may_access_skb(enum bpf_prog_type type) */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 mode = BPF_MODE(insn->code); int i, err; @@ -2999,7 +3102,7 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = &env->cur_state.regs[BPF_REG_0]; + reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); @@ -3363,6 +3466,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + struct idpair *idmap) +{ + int i, spi; + + /* if explored stack has more populated slots than current stack + * such stacks are not equivalent + */ + if (old->allocated_stack > cur->allocated_stack) + return false; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + spi = i / BPF_REG_SIZE; + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; + if (!regsafe(&old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, + idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + } + return true; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -3407,37 +3561,8 @@ static bool states_equal(struct bpf_verifier_env *env, goto out_free; } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (old->stack_slot_type[i] == STACK_INVALID) - continue; - if (old->stack_slot_type[i] != cur->stack_slot_type[i]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - goto out_free; - if (i % BPF_REG_SIZE) - continue; - if (old->stack_slot_type[i] != STACK_SPILL) - continue; - if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - idmap)) - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. - * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. - * return false to continue verification of this path - */ - goto out_free; - else - continue; - } + if (!stacksafe(old, cur, idmap)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -3473,17 +3598,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, } } /* ... and stack slots */ - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { - if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) continue; - if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (parent->spilled_regs[i].live & REG_LIVE_READ) + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; - if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) continue; - if (state->spilled_regs[i].live & REG_LIVE_READ) { - parent->spilled_regs[i].live |= REG_LIVE_READ; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; touched = true; } } @@ -3513,6 +3640,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; + struct bpf_verifier_state *cur = env->cur_state; int i; sl = env->explored_states[insn_idx]; @@ -3523,7 +3651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) { + if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -3534,7 +3662,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, &env->cur_state); + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -3546,16 +3674,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; /* add new state to the head of linked list */ - memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + copy_verifier_state(&new_sl->state, cur); new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - env->cur_state.parent = &new_sl->state; + cur->parent = &new_sl->state; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -3563,10 +3691,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) */ for (i = 0; i < BPF_REG_FP; i++) - env->cur_state.regs[i].live = REG_LIVE_NONE; - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) - if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) - env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } @@ -3581,15 +3709,19 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; - init_reg_state(env, regs); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + env->cur_state = state; + init_reg_state(env, state->regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3637,7 +3769,7 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, &env->cur_state); + print_verifier_state(env, state); do_print_state = false; } @@ -3651,6 +3783,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + regs = cur_regs(env); if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -3818,8 +3951,10 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - insn_idx = pop_stack(env, &prev_insn_idx); - if (insn_idx < 0) { + err = pop_stack(env, &prev_insn_idx, &insn_idx); + if (err < 0) { + if (err != -ENOENT) + return err; break; } else { do_print_state = true; @@ -4359,9 +4494,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret == 0) @@ -4464,9 +4601,11 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + free_verifier_state(env->cur_state); + env->cur_state = NULL; skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); mutex_unlock(&bpf_verifier_lock); -- cgit From 07c41a295c5f25928a7cb689fdec816bd0089fe8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 30 Oct 2017 13:50:22 -0700 Subject: bpf: avoid rcu_dereference inside bpf_event_mutex lock region During perf event attaching/detaching bpf programs, the tp_event->prog_array change is protected by the bpf_event_mutex lock in both attaching and deteching functions. Although tp_event->prog_array is a rcu pointer, rcu_derefrence is not needed to access it since mutex lock will guarantee ordering. Verified through "make C=2" that sparse locking check still happy with the new change. Also change the label name in perf_event_{attach,detach}_bpf_prog from "out" to "unlock" to reflect the code action after the label. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 136aa6bb0422..506efe6e8ed9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -769,20 +769,19 @@ int perf_event_attach_bpf_prog(struct perf_event *event, mutex_lock(&bpf_event_mutex); if (event->prog) - goto out; + goto unlock; - old_array = rcu_dereference_protected(event->tp_event->prog_array, - lockdep_is_held(&bpf_event_mutex)); + old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) - goto out; + goto unlock; /* set the new array to event->tp_event and set event->prog */ event->prog = prog; rcu_assign_pointer(event->tp_event->prog_array, new_array); bpf_prog_array_free(old_array); -out: +unlock: mutex_unlock(&bpf_event_mutex); return ret; } @@ -796,11 +795,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event) mutex_lock(&bpf_event_mutex); if (!event->prog) - goto out; - - old_array = rcu_dereference_protected(event->tp_event->prog_array, - lockdep_is_held(&bpf_event_mutex)); + goto unlock; + old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); @@ -812,6 +809,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_put(event->prog); event->prog = NULL; -out: +unlock: mutex_unlock(&bpf_event_mutex); } -- cgit From 1969db47f8d0e800397abd4ee4e8d27d2b578587 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Nov 2017 00:08:04 -0700 Subject: bpf: fix verifier memory leaks fix verifier memory leaks Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f26f7ad124f..2bb6d6aa7085 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -341,10 +341,12 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, return 0; } -static void free_verifier_state(struct bpf_verifier_state *state) +static void free_verifier_state(struct bpf_verifier_state *state, + bool free_self) { kfree(state->stack); - kfree(state); + if (free_self) + kfree(state); } /* copy verifier state from src to dst growing dst stack space @@ -382,6 +384,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, if (prev_insn_idx) *prev_insn_idx = head->prev_insn_idx; elem = head->next; + free_verifier_state(&head->st, false); kfree(head); env->head = elem; env->stack_size--; @@ -399,14 +402,14 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (!elem) goto err; - err = copy_verifier_state(&elem->st, cur); - if (err) - return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; env->head = elem; env->stack_size++; + err = copy_verifier_state(&elem->st, cur); + if (err) + goto err; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -3641,7 +3644,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i; + int i, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3679,7 +3682,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return -ENOMEM; /* add new state to the head of linked list */ - copy_verifier_state(&new_sl->state, cur); + err = copy_verifier_state(&new_sl->state, cur); + if (err) { + free_verifier_state(&new_sl->state, false); + kfree(new_sl); + return err; + } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ @@ -4424,6 +4432,7 @@ static void free_states(struct bpf_verifier_env *env) if (sl) while (sl != STATE_LIST_MARK) { sln = sl->next; + free_verifier_state(&sl->state, false); kfree(sl); sl = sln; } @@ -4494,7 +4503,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state); + free_verifier_state(env->cur_state, true); env->cur_state = NULL; skip_full_check: @@ -4601,7 +4610,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state); + free_verifier_state(env->cur_state, true); env->cur_state = NULL; skip_full_check: -- cgit From 03c4cc385faaa46e5877f499c6b997fef792f8d3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 1 Nov 2017 12:44:45 +0100 Subject: bpf: cpumap micro-optimization in cpu_map_enqueue Discovered that the compiler laid-out asm code in suboptimal way when studying perf report during benchmarking of cpumap. Help the compiler by the marking unlikely code paths. Signed-off-by: Jesper Dangaard Brouer Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 86e29cbf7827..ce5b669003b2 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -208,7 +208,7 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; - if ((headroom - metasize) < sizeof(*xdp_pkt)) + if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) return NULL; /* Store info in top of packet */ @@ -656,7 +656,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct xdp_pkt *xdp_pkt; xdp_pkt = convert_to_xdp_pkt(xdp); - if (!xdp_pkt) + if (unlikely(!xdp_pkt)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ -- cgit From b06723da824af1e979eb1699623881b5f45a783c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 1 Nov 2017 23:58:09 +0100 Subject: bpf: minor cleanups after merge Two minor cleanups after Dave's recent merge in f8ddadc4db6c ("Merge git://git.kernel.org...") of net into net-next in order to get the code in line with what was done originally in the net tree: i) use max() instead of max_t() since both ranges are u16, ii) don't split the direct access test cases in the middle with bpf_exit test cases from 390ee7e29fc ("bpf: enforce return code for cgroup-bpf programs"). Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2bb6d6aa7085..2cc3e9486a1f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2532,7 +2532,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, continue; reg = &state->stack[i].spilled_ptr; if (reg->type == type && reg->id == dst_reg->id) - reg->range = max_t(u16, reg->range, new_range); + reg->range = max(reg->range, new_range); } } -- cgit From 5beca081be9195b4316ac5f32921df0234ee8e0e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 1 Nov 2017 23:58:10 +0100 Subject: bpf: also improve pattern matches for meta access Follow-up to 0fd4759c5515 ("bpf: fix pattern matches for direct packet access") to cover also the remaining data_meta/data matches in the verifier. The matches are also refactored a bit to simplify handling of all the cases. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 165 +++++++++++++++++++++++++++++--------------------- 1 file changed, 96 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cc3e9486a1f..530b68550fd2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2787,6 +2787,99 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, } } +static bool try_match_pkt_pointers(const struct bpf_insn *insn, + struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + struct bpf_verifier_state *this_branch, + struct bpf_verifier_state *other_branch) +{ + if (BPF_SRC(insn->code) != BPF_X) + return false; + + switch (BPF_OP(insn->code)) { + case BPF_JGT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end > pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + case BPF_JLT: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end < pkt_data', pkt_data > pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JGE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ + find_good_pkt_pointers(this_branch, dst_reg, + dst_reg->type, true); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ + find_good_pkt_pointers(other_branch, src_reg, + src_reg->type, false); + } else { + return false; + } + break; + case BPF_JLE: + if ((dst_reg->type == PTR_TO_PACKET && + src_reg->type == PTR_TO_PACKET_END) || + (dst_reg->type == PTR_TO_PACKET_META && + reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { + /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ + find_good_pkt_pointers(other_branch, dst_reg, + dst_reg->type, false); + } else if ((dst_reg->type == PTR_TO_PACKET_END && + src_reg->type == PTR_TO_PACKET) || + (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && + src_reg->type == PTR_TO_PACKET_META)) { + /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ + find_good_pkt_pointers(this_branch, src_reg, + src_reg->type, true); + } else { + return false; + } + break; + default: + return false; + } + + return true; +} + static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { @@ -2893,75 +2986,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE); mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' > pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end > pkt_data' */ - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' < pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET, - true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end < pkt_data' */ - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' >= pkt_end */ - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end >= pkt_data' */ - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET && - regs[insn->src_reg].type == PTR_TO_PACKET_END) { - /* pkt_data' <= pkt_end */ - find_good_pkt_pointers(other_branch, dst_reg, - PTR_TO_PACKET, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - dst_reg->type == PTR_TO_PACKET_END && - regs[insn->src_reg].type == PTR_TO_PACKET) { - /* pkt_end <= pkt_data' */ - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET, true); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && - dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { - find_good_pkt_pointers(this_branch, dst_reg, - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT && - dst_reg->type == PTR_TO_PACKET_META && - reg_is_init_pkt_pointer(®s[insn->src_reg], PTR_TO_PACKET)) { - find_good_pkt_pointers(other_branch, dst_reg, - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE && - reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - regs[insn->src_reg].type == PTR_TO_PACKET_META) { - find_good_pkt_pointers(other_branch, ®s[insn->src_reg], - PTR_TO_PACKET_META, false); - } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE && - reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && - regs[insn->src_reg].type == PTR_TO_PACKET_META) { - find_good_pkt_pointers(this_branch, ®s[insn->src_reg], - PTR_TO_PACKET_META, false); - } else if (is_pointer_value(env, insn->dst_reg)) { + } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], + this_branch, other_branch) && + is_pointer_value(env, insn->dst_reg)) { verbose(env, "R%d pointer comparison prohibited\n", insn->dst_reg); return -EACCES; -- cgit From 7cce782ef32f5003c18ab8f9f586f2ed5ce2c33e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:05:51 +0100 Subject: bpf: fix link error without CONFIG_NET I ran into this link error with the latest net-next plus linux-next trees when networking is disabled: kernel/bpf/verifier.o:(.rodata+0x2958): undefined reference to `tc_cls_act_analyzer_ops' kernel/bpf/verifier.o:(.rodata+0x2970): undefined reference to `xdp_analyzer_ops' It seems that the code was written to deal with varying contents of the arrray, but the actual #ifdef was missing. Both tc_cls_act_analyzer_ops and xdp_analyzer_ops are defined in the core networking code, so adding a check for CONFIG_NET seems appropriate here, and I've verified this with many randconfig builds Fixes: 4f9218aaf8a4 ("bpf: move knowledge about post-translation offsets out of verifier") Signed-off-by: Arnd Bergmann Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 530b68550fd2..5f3799dcba01 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4588,8 +4588,10 @@ err_free_env: } static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { +#ifdef CONFIG_NET [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, +#endif }; int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, -- cgit From eba0c929d1d0f16c4b03628b7bf8ce363b9e5c9a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:05:52 +0100 Subject: bpf: fix out-of-bounds access warning in bpf_check The bpf_verifer_ops array is generated dynamically and may be empty depending on configuration, which then causes an out of bounds access: kernel/bpf/verifier.c: In function 'bpf_check': kernel/bpf/verifier.c:4320:29: error: array subscript is above array bounds [-Werror=array-bounds] This adds a check to the start of the function as a workaround. I would assume that the function is never called in that configuration, so the warning is probably harmless. Fixes: 00176a34d9e2 ("bpf: remove the verifier ops from program structure") Signed-off-by: Arnd Bergmann Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f3799dcba01..ab5aa5497666 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4474,6 +4474,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) struct bpf_verifer_log *log; int ret = -EINVAL; + /* no program is valid */ + if (ARRAY_SIZE(bpf_verifier_ops) == 0) + return -EINVAL; + /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ -- cgit From 8c01c4f896aa3404af948880dcb29a2d51c833dc Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Thu, 2 Nov 2017 11:18:01 -0400 Subject: bpf: fix verifier NULL pointer dereference do_check() can fail early without allocating env->cur_state under memory pressure. Syzkaller found the stack below on the linux-next tree because of this. kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 27062 Comm: syz-executor5 Not tainted 4.14.0-rc7+ #106 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801c2c74700 task.stack: ffff8801c3e28000 RIP: 0010:free_verifier_state kernel/bpf/verifier.c:347 [inline] RIP: 0010:bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533 RSP: 0018:ffff8801c3e2f5c8 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 00000000fffffff4 RCX: 0000000000000000 RDX: 0000000000000070 RSI: ffffffff817d5aa9 RDI: 0000000000000380 RBP: ffff8801c3e2f668 R08: 0000000000000000 R09: 1ffff100387c5d9f R10: 00000000218c4e80 R11: ffffffff85b34380 R12: ffff8801c4dc6a28 R13: 0000000000000000 R14: ffff8801c4dc6a00 R15: ffff8801c4dc6a20 FS: 00007f311079b700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004d4a24 CR3: 00000001cbcd0000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: bpf_prog_load+0xcbb/0x18e0 kernel/bpf/syscall.c:1166 SYSC_bpf kernel/bpf/syscall.c:1690 [inline] SyS_bpf+0xae9/0x4620 kernel/bpf/syscall.c:1652 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x452869 RSP: 002b:00007f311079abe8 EFLAGS: 00000212 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452869 RDX: 0000000000000030 RSI: 0000000020168000 RDI: 0000000000000005 RBP: 00007f311079aa20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000004b7550 R13: 00007f311079ab58 R14: 00000000004b7560 R15: 0000000000000000 Code: df 48 c1 ea 03 80 3c 02 00 0f 85 e6 0b 00 00 4d 8b 6e 20 48 b8 00 00 00 00 00 fc ff df 49 8d bd 80 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 b6 0b 00 00 49 8b bd 80 03 00 00 e8 d6 0c 26 RIP: free_verifier_state kernel/bpf/verifier.c:347 [inline] RSP: ffff8801c3e2f5c8 RIP: bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533 RSP: ffff8801c3e2f5c8 ---[ end trace c8d37f339dc64004 ]--- Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption") Fixes: 1969db47f8d0 ("bpf: fix verifier memory leaks") Signed-off-by: Craig Gallek Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ab5aa5497666..04357ad5a812 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4534,8 +4534,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: while (!pop_stack(env, NULL, NULL)); @@ -4643,8 +4645,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: while (!pop_stack(env, NULL, NULL)); -- cgit From edbfd9112f70c34b2965580a67dad5fb306fb6c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Nov 2017 07:02:15 -0700 Subject: Revert "workqueue: respect isolated cpus when queueing an unbound work" This reverts commit b5149873a0c299195b5346fe4dc2c5b04ae2f995. It conflicts with the following isolcpus change from the sched branch. edb9382175c3 ("sched/isolation: Move isolcpus= handling to the housekeeping code") Let's revert for now. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bfa433b38a61..64d0edf428f8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4980,10 +4980,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; - /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. - */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5583,7 +5579,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map); + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit From 2d2123bc7c7f843aa9db87720de159a049839862 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 31 Oct 2017 15:51:14 +0000 Subject: arm64/sve: Add prctl controls for userspace vector length management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds two arm64-specific prctls, to permit userspace to control its vector length: * PR_SVE_SET_VL: set the thread's SVE vector length and vector length inheritance mode. * PR_SVE_GET_VL: get the same information. Although these prctls resemble instruction set features in the SVE architecture, they provide additional control: the vector length inheritance mode is Linux-specific and nothing to do with the architecture, and the architecture does not permit EL0 to set its own vector length directly. Both can be used in portable tools without requiring the use of SVE instructions. Signed-off-by: Dave Martin Reviewed-by: Catalin Marinas Cc: Alex Bennée [will: Fixed up prctl constants to avoid clash with PDEATHSIG] Signed-off-by: Will Deacon --- kernel/sys.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 9aebc2935013..c541916b38c6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -110,6 +110,12 @@ #ifndef SET_FP_MODE # define SET_FP_MODE(a,b) (-EINVAL) #endif +#ifndef SVE_SET_VL +# define SVE_SET_VL(a) (-EINVAL) +#endif +#ifndef SVE_GET_VL +# define SVE_GET_VL() (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2385,6 +2391,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SVE_SET_VL: + error = SVE_SET_VL(arg2); + break; + case PR_SVE_GET_VL: + error = SVE_GET_VL(); + break; default: error = -EINVAL; break; -- cgit From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:17 -0700 Subject: bpf: offload: add infrastructure for loading programs for a specific netdev The fact that we don't know which device the program is going to be used on is quite limiting in current eBPF infrastructure. We have to reverse or limit the changes which kernel makes to the loaded bytecode if we want it to be offloaded to a networking device. We also have to invent new APIs for debugging and troubleshooting support. Make it possible to load programs for a specific netdev. This helps us to bring the debug information closer to the core eBPF infrastructure (e.g. we will be able to reuse the verifer log in device JIT). It allows device JITs to perform translation on the original bytecode. __bpf_prog_get() when called to get a reference for an attachment point will now refuse to give it if program has a device assigned. Following patches will add a version of that function which passes the expected netdev in. @type argument in __bpf_prog_get() is renamed to attach_type to make it clearer that it's only set on attachment. All calls to ndo_bpf are protected by rtnl, only verifier callbacks are not. We need a wait queue to make sure netdev doesn't get destroyed while verifier is still running and calling its driver. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/Makefile | 1 + kernel/bpf/core.c | 10 ++- kernel/bpf/offload.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 17 +++-- kernel/bpf/verifier.c | 15 ++++- 5 files changed, 217 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/offload.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 16e95c8e749e..e691da0b3bab 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7fe448799d76..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. */ - fp = bpf_int_jit_compile(fp); + if (!bpf_prog_is_dev_bound(fp->aux)) { + fp = bpf_int_jit_compile(fp); + } else { + *err = bpf_prog_offload_compile(fp); + if (*err) + return fp; + } bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work) struct bpf_prog_aux *aux; aux = container_of(work, struct bpf_prog_aux, work); + if (bpf_prog_is_dev_bound(aux)) + bpf_prog_offload_destroy(aux->prog); bpf_jit_free(aux->prog); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c new file mode 100644 index 000000000000..5553e0e2f8b1 --- /dev/null +++ b/kernel/bpf/offload.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include + +/* protected by RTNL */ +static LIST_HEAD(bpf_prog_offload_devs); + +int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_dev_offload *offload; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->prog_flags) + return -EINVAL; + + offload = kzalloc(sizeof(*offload), GFP_USER); + if (!offload) + return -ENOMEM; + + offload->prog = prog; + init_waitqueue_head(&offload->verifier_done); + + rtnl_lock(); + offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex); + if (!offload->netdev) { + rtnl_unlock(); + kfree(offload); + return -EINVAL; + } + + prog->aux->offload = offload; + list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + rtnl_unlock(); + + return 0; +} + +static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, + struct netdev_bpf *data) +{ + struct net_device *netdev = prog->aux->offload->netdev; + + ASSERT_RTNL(); + + if (!netdev) + return -ENODEV; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + data->command = cmd; + + return netdev->netdev_ops->ndo_bpf(netdev, data); +} + +int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +{ + struct netdev_bpf data = {}; + int err; + + data.verifier.prog = env->prog; + + rtnl_lock(); + err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data); + if (err) + goto exit_unlock; + + env->dev_ops = data.verifier.ops; + + env->prog->aux->offload->dev_state = true; + env->prog->aux->offload->verifier_running = true; +exit_unlock: + rtnl_unlock(); + return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + + data.offload.prog = prog; + + if (offload->verifier_running) + wait_event(offload->verifier_done, !offload->verifier_running); + + if (offload->dev_state) + WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + + offload->dev_state = false; + list_del_init(&offload->offloads); + offload->netdev = NULL; +} + +void bpf_prog_offload_destroy(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + __bpf_prog_offload_destroy(prog); + rtnl_unlock(); + + kfree(offload); +} + +static int bpf_prog_offload_translate(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + struct netdev_bpf data = {}; + int ret; + + data.offload.prog = prog; + + offload->verifier_running = false; + wake_up(&offload->verifier_done); + + rtnl_lock(); + ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); + rtnl_unlock(); + + return ret; +} + +static unsigned int bpf_prog_warn_on_exec(const void *ctx, + const struct bpf_insn *insn) +{ + WARN(1, "attempt to execute device eBPF program on the host!"); + return 0; +} + +int bpf_prog_offload_compile(struct bpf_prog *prog) +{ + prog->bpf_func = bpf_prog_warn_on_exec; + + return bpf_prog_offload_translate(prog); +} + +const struct bpf_prog_ops bpf_offload_prog_ops = { +}; + +static int bpf_offload_notification(struct notifier_block *notifier, + ulong event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct bpf_dev_offload *offload, *tmp; + + ASSERT_RTNL(); + + switch (event) { + case NETDEV_UNREGISTER: + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, + offloads) { + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block bpf_offload_notifier = { + .notifier_call = bpf_offload_notification, +}; + +static int __init bpf_offload_init(void) +{ + register_netdevice_notifier(&bpf_offload_notifier); + return 0; +} + +subsys_initcall(bpf_offload_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 323be2473c4b..1574b9f0f24e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) return -EINVAL; - prog->aux->ops = bpf_prog_types[type]; + if (!bpf_prog_is_dev_bound(prog->aux)) + prog->aux->ops = bpf_prog_types[type]; + else + prog->aux->ops = &bpf_offload_prog_ops; prog->type = type; return 0; } @@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (type && prog->type != *type) { + if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_name +#define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex static int bpf_prog_load(union bpf_attr *attr) { @@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; + if (attr->prog_target_ifindex) { + err = bpf_prog_offload_init(prog, attr); + if (err) + goto free_prog; + } + /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04357ad5a812..51aabb32ad67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (!env->analyzer_ops || !env->analyzer_ops->insn_hook) - return 0; + if (env->analyzer_ops && env->analyzer_ops->insn_hook) + return env->analyzer_ops->insn_hook(env, insn_idx, + prev_insn_idx); + if (env->dev_ops && env->dev_ops->insn_hook) + return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx); + return 0; } static int do_check(struct bpf_verifier_env *env) @@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + if (env->prog->aux->offload) { + ret = bpf_prog_offload_verifier_prep(env); + if (ret) + goto err_unlock; + } + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; -- cgit From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:18 -0700 Subject: bpf: report offload info to user space Extend struct bpf_prog_info to contain information about program being bound to a device. Since the netdev may get destroyed while program still exists we need a flag to indicate the program is loaded for a device, even if the device is gone. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/offload.c | 12 ++++++++++++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 5553e0e2f8b1..2816feb38be1 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + u32 ifindex; + + rtnl_lock(); + ifindex = offload->netdev ? offload->netdev->ifindex : 0; + rtnl_unlock(); + + return ifindex; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1574b9f0f24e..3217c20ea91b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1592,6 +1592,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + info.status |= BPF_PROG_STATUS_DEV_BOUND; + info.ifindex = bpf_prog_offload_ifindex(prog); + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:20 -0700 Subject: xdp: allow attaching programs loaded for specific device Pass the netdev pointer to bpf_prog_get_type(). This way BPF code can decide whether the device matches what the code was loaded/translated for. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3217c20ea91b..68f9123acd39 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,7 +1057,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) +static bool bpf_prog_can_attach(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, + struct net_device *netdev) +{ + struct bpf_dev_offload *offload = prog->aux->offload; + + if (prog->type != *attach_type) + return false; + if (offload && offload->netdev != netdev) + return false; + + return true; +} + +static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, + struct net_device *netdev) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1065,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type) prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && (prog->type != *attach_type || prog->aux->offload)) { + if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1078,12 +1093,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL); + return __bpf_prog_get(ufd, NULL, NULL); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1091,6 +1106,16 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) } EXPORT_SYMBOL_GPL(bpf_prog_get_type); +struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, + struct net_device *netdev) +{ + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev); + + if (!IS_ERR(prog)) + trace_bpf_prog_get_type(prog); + return prog; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex -- cgit From 6c8dfe21c435cf2953e3cee43e12180cbc4f0820 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:21 -0700 Subject: cls_bpf: allow attaching programs loaded for specific device If TC program is loaded with skip_sw flag, we should allow the device-specific programs to be accepted. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 68f9123acd39..416d70cdfc76 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1115,6 +1115,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, trace_bpf_prog_get_type(prog); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex -- cgit From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Nov 2017 13:56:30 -0700 Subject: bpf: remove old offload/analyzer Thanks to the ability to load a program for a specific device, running verifier twice is no longer needed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 75 --------------------------------------------------- 1 file changed, 75 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 51aabb32ad67..add845fe788a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -949,9 +949,6 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (env->analyzer_ops) - return 0; - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) @@ -3736,9 +3733,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - if (env->analyzer_ops && env->analyzer_ops->insn_hook) - return env->analyzer_ops->insn_hook(env, insn_idx, - prev_insn_idx); if (env->dev_ops && env->dev_ops->insn_hook) return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); @@ -4601,72 +4595,3 @@ err_free_env: kfree(env); return ret; } - -static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = { -#ifdef CONFIG_NET - [BPF_PROG_TYPE_XDP] = &xdp_analyzer_ops, - [BPF_PROG_TYPE_SCHED_CLS] = &tc_cls_act_analyzer_ops, -#endif -}; - -int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, - void *priv) -{ - struct bpf_verifier_env *env; - int ret; - - if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) || - !bpf_analyzer_ops[prog->type]) - return -EOPNOTSUPP; - - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - prog->len); - ret = -ENOMEM; - if (!env->insn_aux_data) - goto err_free_env; - env->prog = prog; - env->ops = bpf_analyzer_ops[env->prog->type]; - env->analyzer_ops = ops; - env->analyzer_priv = priv; - - /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); - - env->strict_alignment = false; - if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) - env->strict_alignment = true; - - env->explored_states = kcalloc(env->prog->len, - sizeof(struct bpf_verifier_state_list *), - GFP_KERNEL); - ret = -ENOMEM; - if (!env->explored_states) - goto skip_full_check; - - ret = check_cfg(env); - if (ret < 0) - goto skip_full_check; - - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - - ret = do_check(env); - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - -skip_full_check: - while (!pop_stack(env, NULL, NULL)); - free_states(env); - - mutex_unlock(&bpf_verifier_lock); - vfree(env->insn_aux_data); -err_free_env: - kfree(env); - return ret; -} -EXPORT_SYMBOL_GPL(bpf_analyzer); -- cgit From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 5 Nov 2017 08:15:32 -0500 Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2 Cgroup v2 lacks the device controller, provided by cgroup v1. This patch adds a new eBPF program type, which in combination of previously added ability to attach multiple eBPF programs to a cgroup, will provide a similar functionality, but with some additional flexibility. This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type. A program takes major and minor device numbers, device type (block/character) and access type (mknod/read/write) as parameters and returns an integer which defines if the operation should be allowed or terminated with -EPERM. Signed-off-by: Roman Gushchin Acked-by: Alexei Starovoitov Acked-by: Tejun Heo Cc: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/cgroup.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 7 ++++++ kernel/bpf/verifier.c | 1 + 3 files changed, 75 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 3db5a17fcfe8..b789ab78d28f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, return ret == 1 ? 0 : -EPERM; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); + +int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, + short access, enum bpf_attach_type type) +{ + struct cgroup *cgrp; + struct bpf_cgroup_dev_ctx ctx = { + .access_type = (access << 16) | dev_type, + .major = major, + .minor = minor, + }; + int allow = 1; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN); + rcu_read_unlock(); + + return !allow; +} +EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); + +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_trace_printk: + if (capable(CAP_SYS_ADMIN)) + return bpf_get_trace_printk_proto(); + default: + return NULL; + } +} + +static bool cgroup_dev_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +const struct bpf_prog_ops cg_dev_prog_ops = { +}; + +const struct bpf_verifier_ops cg_dev_verifier_ops = { + .get_func_proto = cgroup_dev_func_proto, + .is_valid_access = cgroup_dev_is_valid_access, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 416d70cdfc76..09badc37e864 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, true); @@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_DEVICE: + ptype = BPF_PROG_TYPE_CGROUP_DEVICE; + break; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, false); @@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: break; default: return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index add845fe788a..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env) case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: break; default: return 0; -- cgit From 9a19b463863e757e649c37af245b6af101410c1e Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 2 Nov 2017 23:05:12 -0400 Subject: workqueue: Fix comment for unbound workqueue's attrbutes Signed-off-by: Wang Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 64d0edf428f8..5f99851bff09 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5013,9 +5013,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) * * Unbound workqueues have the following extra attributes. * - * id RO int : the associated pool ID + * pool_ids RO int : the associated pool IDs for each node * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers + * numa RW bool : whether enable NUMA affinity */ struct wq_device { struct workqueue_struct *wq; -- cgit From 01ee6cfb1483fe57c9cbd8e73817dfbf9bacffd3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Nov 2017 13:30:28 -0500 Subject: cgroup: export list of delegatable control files using sysfs Delegatable cgroup v2 control files may require special handling (e.g. chowning), and the exact list of such files varies between kernel versions (and likely to be extended in the future). To guarantee correctness of this list and simplify the life of userspace (systemd, first of all), let's export the list via /sys/kernel/cgroup/delegate pseudo-file. Format is siple: each control file name is printed on a new line. Example: $ cat /sys/kernel/cgroup/delegate cgroup.procs cgroup.subtree_control Signed-off-by: Roman Gushchin Cc: Tejun Heo Cc: kernel-team@fb.com Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d6ed725f36d9..eed92ed624e5 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5832,3 +5832,64 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog, return ret; } #endif /* CONFIG_CGROUP_BPF */ + +#ifdef CONFIG_SYSFS +static ssize_t show_delegatable_files(struct cftype *files, char *buf, + ssize_t size, const char *prefix) +{ + struct cftype *cft; + ssize_t ret = 0; + + for (cft = files; cft && cft->name[0] != '\0'; cft++) { + if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) + continue; + + if (prefix) + ret += snprintf(buf + ret, size - ret, "%s.", prefix); + + ret += snprintf(buf + ret, size - ret, "%s\n", cft->name); + + if (unlikely(ret >= size)) { + WARN_ON(1); + break; + } + } + + return ret; +} + +static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct cgroup_subsys *ss; + int ssid; + ssize_t ret = 0; + + ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, + NULL); + + for_each_subsys(ss, ssid) + ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, + PAGE_SIZE - ret, + cgroup_subsys_name[ssid]); + + return ret; +} +static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); + +static struct attribute *cgroup_sysfs_attrs[] = { + &cgroup_delegate_attr.attr, + NULL, +}; + +static const struct attribute_group cgroup_sysfs_attr_group = { + .attrs = cgroup_sysfs_attrs, + .name = "cgroup", +}; + +static int __init cgroup_sysfs_init(void) +{ + return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); +} +subsys_initcall(cgroup_sysfs_init); +#endif /* CONFIG_SYSFS */ -- cgit From 5f2e673405b742be64e7c3604ed4ed3ac14f35ce Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 6 Nov 2017 13:30:29 -0500 Subject: cgroup: export list of cgroups v2 features using sysfs The active development of cgroups v2 sometimes leads to a creation of interfaces, which are not turned on by default (to provide backward compatibility). It's handy to know from userspace, which cgroup v2 features are supported without calculating it based on the kernel version. So, let's export the list of such features using /sys/kernel/cgroup/features pseudo-file. The list is hardcoded and has to be extended when new functionality is added. Each feature is printed on a new line. Example: $ cat /sys/kernel/cgroup/features nsdelegate Signed-off-by: Roman Gushchin Cc: Tejun Heo Cc: kernel-team@fb.com Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index eed92ed624e5..69e65d28fe98 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5877,8 +5877,16 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, } static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); +static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); +} +static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); + static struct attribute *cgroup_sysfs_attrs[] = { &cgroup_delegate_attr.attr, + &cgroup_features_attr.attr, NULL, }; -- cgit From 95b982b45122c57da2ee0b46cce70775e1d987af Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Tue, 31 Oct 2017 14:44:24 -0700 Subject: PM / s2idle: Clear the events_check_enabled flag Problem: This flag does not get cleared currently in the suspend or resume path in the following cases: * In case some driver's suspend routine returns an error. * Successful s2idle case * etc? Why is this a problem: What happens is that the next suspend attempt could fail even though the user did not enable the flag by writing to /sys/power/wakeup_count. This is 1 use case how the issue can be seen (but similar use case with driver suspend failure can be thought of): 1. Read /sys/power/wakeup_count 2. echo count > /sys/power/wakeup_count 3. echo freeze > /sys/power/wakeup_count 4. Let the system suspend, and wakeup the system using some wake source that calls pm_wakeup_event() e.g. power button or something. 5. Note that the combined wakeup count would be incremented due to the pm_wakeup_event() in the resume path. 6. After resuming the events_check_enabled flag is still set. At this point if the user attempts to freeze again (without writing to /sys/power/wakeup_count), the suspend would fail even though there has been no wake event since the past resume. Address that by clearing the flag just before a resume is completed, so that it is always cleared for the corner cases mentioned above. Signed-off-by: Rajat Jain Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ccd2d20e6b06..0685c4499431 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -437,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = suspend_ops->enter(state); trace_suspend_resume(TPS("machine_suspend"), state, false); - events_check_enabled = false; } else if (*wakeup) { error = -EBUSY; } @@ -582,6 +581,7 @@ static int enter_state(suspend_state_t state) pm_restore_gfp_mask(); Finish: + events_check_enabled = false; pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: -- cgit From 07458f6a5171d97511dfbdf6ce549ed2ca0280c7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 8 Nov 2017 20:23:55 +0530 Subject: cpufreq: schedutil: Reset cached_raw_freq when not in sync with next_freq 'cached_raw_freq' is used to get the next frequency quickly but should always be in sync with sg_policy->next_freq. There is a case where it is not and in such cases it should be reset to avoid switching to incorrect frequencies. Consider this case for example: - policy->cur is 1.2 GHz (Max) - New request comes for 780 MHz and we store that in cached_raw_freq. - Based on 780 MHz, we calculate the effective frequency as 800 MHz. - We then see the CPU wasn't idle recently and choose to keep the next freq as 1.2 GHz. - Now we have cached_raw_freq is 780 MHz and sg_policy->next_freq is 1.2 GHz. - Now if the utilization doesn't change in then next request, then the next target frequency will still be 780 MHz and it will match with cached_raw_freq. But we will choose 1.2 GHz instead of 800 MHz here. Fixes: b7eaf1aab9f8 (cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely) Signed-off-by: Viresh Kumar Cc: 4.12+ # 4.12+ Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index ba0da243fdd8..2f52ec0f1539 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq) + if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; + + /* Reset cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = 0; + } } sugov_update_commit(sg_policy, time, next_f); } -- cgit From 173743dd99a49c956b124a74c8aacb0384739a4c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:34 -0400 Subject: audit: ensure that 'audit=1' actually enables audit for PID 1 Prior to this patch we enabled audit in audit_init(), which is too late for PID 1 as the standard initcalls are run after the PID 1 task is forked. This means that we never allocate an audit_context (see audit_alloc()) for PID 1 and therefore miss a lot of audit events generated by PID 1. This patch enables audit as early as possible to help ensure that when PID 1 is forked it can allocate an audit_context if required. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index be1c28fd4d57..ec3d0802734d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,13 +85,13 @@ static int audit_initialized; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 -u32 audit_enabled; -u32 audit_ever_enabled; +u32 audit_enabled = AUDIT_OFF; +u32 audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ -static u32 audit_default; +static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. */ static u32 audit_failure = AUDIT_FAIL_PRINTK; @@ -1549,8 +1549,6 @@ static int __init audit_init(void) register_pernet_subsys(&audit_net_ops); audit_initialized = AUDIT_INITIALIZED; - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); if (IS_ERR(kauditd_task)) { @@ -1572,6 +1570,8 @@ static int __init audit_enable(char *str) audit_default = !!simple_strtol(str, NULL, 0); if (!audit_default) audit_initialized = AUDIT_DISABLED; + audit_enabled = audit_default; + audit_ever_enabled = !!audit_enabled; pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); -- cgit From be4104abf25c63ccef36e1e06262c73c0df9fd60 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:44 -0400 Subject: audit: initialize the audit subsystem as early as possible We can't initialize the audit subsystem until after the network layer is initialized (core_initcall), but do it soon after. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ec3d0802734d..db71c45ea6f8 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1562,7 +1562,7 @@ static int __init audit_init(void) return 0; } -__initcall(audit_init); +postcore_initcall(audit_init); /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ static int __init audit_enable(char *str) -- cgit From 80ab4df62706b882922c3bb0b05ce2c8ab10828a Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:51 -0400 Subject: audit: don't use simple_strtol() anymore The simple_strtol() function is deprecated, use kstrtol() instead. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index db71c45ea6f8..e75918f79a83 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1567,8 +1567,13 @@ postcore_initcall(audit_init); /* Process kernel command-line parameter at boot time. audit=0 or audit=1. */ static int __init audit_enable(char *str) { - audit_default = !!simple_strtol(str, NULL, 0); - if (!audit_default) + long val; + + if (kstrtol(str, 0, &val)) + panic("audit: invalid 'audit' parameter value (%s)\n", str); + audit_default = (val ? AUDIT_ON : AUDIT_OFF); + + if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; audit_enabled = audit_default; audit_ever_enabled = !!audit_enabled; -- cgit From b3b4fdf6a8ae9ea51ea274e3f2f8a6a58b98cc0b Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:44:57 -0400 Subject: audit: convert audit_ever_enabled to a boolean We were treating it as a boolean, let's make it a boolean to help avoid future mistakes. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/audit.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e75918f79a83..f0cf9bfc806c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -86,7 +86,7 @@ static int audit_initialized; #define AUDIT_ON 1 #define AUDIT_LOCKED 2 u32 audit_enabled = AUDIT_OFF; -u32 audit_ever_enabled = !!AUDIT_OFF; +bool audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); diff --git a/kernel/audit.h b/kernel/audit.h index b331d9b83f63..6bdaf6bd377e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -208,7 +208,7 @@ struct audit_context { struct audit_proctitle proctitle; }; -extern u32 audit_ever_enabled; +extern bool audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, -- cgit From 5d842a5b77a58160625548fa6be2dc159179ffdb Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 1 Sep 2017 09:45:05 -0400 Subject: audit: use audit_set_enabled() in audit_enable() Use audit_set_enabled() to enable auditing during early boot. This obviously won't emit an audit change record, but it will work anyway and should help prevent in future problems by consolidating the enable/disable code in one function. Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index f0cf9bfc806c..67b3863261d4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1575,8 +1575,8 @@ static int __init audit_enable(char *str) if (audit_default == AUDIT_OFF) audit_initialized = AUDIT_DISABLED; - audit_enabled = audit_default; - audit_ever_enabled = !!audit_enabled; + if (audit_set_enabled(audit_default)) + panic("audit: error setting audit state (%d)\n", audit_default); pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); -- cgit From 33e8a907804428109ce1d12301c3365d619cc4df Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Tue, 17 Oct 2017 18:29:22 -0400 Subject: audit: Allow auditd to set pid to 0 to end auditing The API to end auditing has historically been for auditd to set the pid to 0. This patch restores that functionality. See: https://github.com/linux-audit/audit-kernel/issues/69 Reviewed-by: Richard Guy Briggs Signed-off-by: Steve Grubb Signed-off-by: Paul Moore --- kernel/audit.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 67b3863261d4..64e1d0ec19de 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid_t auditd_pid; struct pid *req_pid = task_tgid(current); - /* sanity check - PID values must match */ - if (new_pid != pid_vnr(req_pid)) + /* Sanity check - PID values must match. Setting + * pid to 0 is how auditd ends auditing. */ + if (new_pid && (new_pid != pid_vnr(req_pid))) return -EINVAL; /* test the auditd connection */ audit_replace(req_pid); auditd_pid = auditd_pid_vnr(); - /* only the current auditd can unregister itself */ - if ((!new_pid) && (new_pid != auditd_pid)) { - audit_log_config_change("audit_pid", new_pid, - auditd_pid, 0); - return -EACCES; - } - /* replacing a healthy auditd is not allowed */ - if (auditd_pid && new_pid) { - audit_log_config_change("audit_pid", new_pid, - auditd_pid, 0); - return -EEXIST; + if (auditd_pid) { + /* replacing a healthy auditd is not allowed */ + if (new_pid) { + audit_log_config_change("audit_pid", + new_pid, auditd_pid, 0); + return -EEXIST; + } + /* only current auditd can unregister itself */ + if (pid_vnr(req_pid) != auditd_pid) { + audit_log_config_change("audit_pid", + new_pid, auditd_pid, 0); + return -EACCES; + } } if (new_pid) { -- cgit From f7b53637c090bd8ce2dc74ad0f3aa1898aff2524 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 24 Oct 2017 18:52:31 -0700 Subject: Audit: remove unused audit_log_secctx function The function audit_log_secctx() is unused in the upstream kernel. All it does is wrap another function that doesn't need wrapping. It claims to give you the SELinux context, but that is not true if you are using a different security module. Signed-off-by: Casey Schaufler Reviewed-by: James Morris Signed-off-by: Paul Moore --- kernel/audit.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 64e1d0ec19de..227db99b0f19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2345,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, } } -#ifdef CONFIG_SECURITY -/** - * audit_log_secctx - Converts and logs SELinux context - * @ab: audit_buffer - * @secid: security number - * - * This is a helper function that calls security_secid_to_secctx to convert - * secid to secctx and then adds the (converted) SELinux context to the audit - * log by calling audit_log_format, thus also preventing leak of internal secid - * to userspace. If secid cannot be converted audit_panic is called. - */ -void audit_log_secctx(struct audit_buffer *ab, u32 secid) -{ - u32 len; - char *secctx; - - if (security_secid_to_secctx(secid, &secctx, &len)) { - audit_panic("Cannot convert secid to context"); - } else { - audit_log_format(ab, " obj=%s", secctx); - security_release_secctx(secctx, len); - } -} -EXPORT_SYMBOL(audit_log_secctx); -#endif - EXPORT_SYMBOL(audit_log_start); EXPORT_SYMBOL(audit_log_end); EXPORT_SYMBOL(audit_log_format); -- cgit From 42d5e37654e4cdb9fb2e2f3ab30045fee35c42d8 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 23 Aug 2017 07:03:39 -0400 Subject: audit: filter PATH records keyed on filesystem magic Tracefs or debugfs were causing hundreds to thousands of PATH records to be associated with the init_module and finit_module SYSCALL records on a few modules when the following rule was in place for startup: -a always,exit -F arch=x86_64 -S init_module -F key=mod-load Provide a method to ignore these large number of PATH records from overwhelming the logs if they are not of interest. Introduce a new filter list "AUDIT_FILTER_FS", with a new field type AUDIT_FSTYPE, which keys off the filesystem 4-octet hexadecimal magic identifier to filter specific filesystem PATH records. An example rule would look like: -a never,filesystem -F fstype=0x74726163 -F key=ignore_tracefs -a never,filesystem -F fstype=0x64626720 -F key=ignore_debugfs Arguably the better way to address this issue is to disable tracefs and debugfs on boot from production systems. See: https://github.com/linux-audit/audit-kernel/issues/16 See: https://github.com/linux-audit/audit-userspace/issues/8 Test case: https://github.com/linux-audit/audit-testsuite/issues/42 Signed-off-by: Richard Guy Briggs [PM: fixed the whitespace damage in kernel/auditsc.c] Signed-off-by: Paul Moore --- kernel/auditfilter.c | 39 ++++++++++++++++++++++++++++++++------- kernel/auditsc.c | 23 +++++++++++++++++++++++ 2 files changed, 55 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 0b0aa5854dac..4a1758adb222 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[3]), LIST_HEAD_INIT(audit_filter_list[4]), LIST_HEAD_INIT(audit_filter_list[5]), -#if AUDIT_NR_FILTERS != 6 + LIST_HEAD_INIT(audit_filter_list[6]), +#if AUDIT_NR_FILTERS != 7 #error Fix audit_filter_list initialiser #endif }; @@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_rules_list[3]), LIST_HEAD_INIT(audit_rules_list[4]), LIST_HEAD_INIT(audit_rules_list[5]), + LIST_HEAD_INIT(audit_rules_list[6]), }; DEFINE_MUTEX(audit_filter_mutex); @@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data * #endif case AUDIT_FILTER_USER: case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; + case AUDIT_FSTYPE: + if (entry->rule.listnr != AUDIT_FILTER_FS) + return -EINVAL; + break; + } + + switch(entry->rule.listnr) { + case AUDIT_FILTER_FS: + switch(f->type) { + case AUDIT_FSTYPE: + case AUDIT_FILTERKEY: + break; + default: + return -EINVAL; + } } switch(f->type) { @@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return -EINVAL; /* FALL THROUGH */ case AUDIT_ARCH: + case AUDIT_FSTYPE: if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; @@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry) #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) + /* If any of these, don't count towards total */ + switch(entry->rule.listnr) { + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: dont_count = 1; + } #endif mutex_lock(&audit_filter_mutex); @@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry) #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; - /* If either of these, don't count towards total */ - if (entry->rule.listnr == AUDIT_FILTER_USER || - entry->rule.listnr == AUDIT_FILTER_TYPE) + /* If any of these, don't count towards total */ + switch(entry->rule.listnr) { + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: + case AUDIT_FILTER_FS: dont_count = 1; + } #endif mutex_lock(&audit_filter_mutex); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index aac1a41f82bd..c9bb29e17335 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent, struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; + struct audit_entry *e; + struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; + int i; if (!context->in_syscall) return; + rcu_read_lock(); + if (!list_empty(list)) { + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE) { + if (audit_comparator(parent->i_sb->s_magic, + f->op, f->val)) { + if (e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; + } + } + } + } + } + } + rcu_read_unlock(); + if (inode) handle_one(inode); -- cgit From 1f2cac107c591c24b60b115d6050adc213d10fc0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:13:48 -0700 Subject: blktrace: fix unlocked access to init/start-stop/teardown sg.c calls into the blktrace functions without holding the proper queue mutex for doing setup, start/stop, or teardown. Add internal unlocked variants, and export the ones that do the proper locking. Fixes: 6da127ad0918 ("blktrace: Add blktrace ioctls to SCSI generic devices") Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 45a3928544ce..ea57dd94b2b2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -336,7 +336,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) blk_unregister_tracepoints(); } -int blk_trace_remove(struct request_queue *q) +static int __blk_trace_remove(struct request_queue *q) { struct blk_trace *bt; @@ -349,6 +349,17 @@ int blk_trace_remove(struct request_queue *q) return 0; } + +int blk_trace_remove(struct request_queue *q) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_remove(q); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_remove); static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, @@ -550,9 +561,8 @@ err: return ret; } -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) +static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -571,6 +581,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } return 0; } + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_setup(q, name, dev, bdev, arg); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_setup); #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) @@ -607,7 +630,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, } #endif -int blk_trace_startstop(struct request_queue *q, int start) +static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; struct blk_trace *bt = q->blk_trace; @@ -646,6 +669,17 @@ int blk_trace_startstop(struct request_queue *q, int start) return ret; } + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + + mutex_lock(&q->blk_trace_mutex); + ret = __blk_trace_startstop(q, start); + mutex_unlock(&q->blk_trace_mutex); + + return ret; +} EXPORT_SYMBOL_GPL(blk_trace_startstop); /* @@ -676,7 +710,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: @@ -687,10 +721,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) case BLKTRACESTART: start = 1; case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); + ret = __blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); + ret = __blk_trace_remove(q); break; default: ret = -ENOTTY; @@ -708,10 +742,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { + mutex_lock(&q->blk_trace_mutex); + if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); + __blk_trace_startstop(q, 0); + __blk_trace_remove(q); } + + mutex_unlock(&q->blk_trace_mutex); } #ifdef CONFIG_BLK_CGROUP -- cgit From a6da0024ffc19e0d47712bb5ca4fd083f76b07df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 5 Nov 2017 09:16:09 -0700 Subject: blktrace: fix unlocked registration of tracepoints We need to ensure that tracepoints are registered and unregistered with the users of them. The existing atomic count isn't enough for that. Add a lock around the tracepoints, so we serialize access to them. This fixes cases where we have multiple users setting up and tearing down tracepoints, like this: CPU: 0 PID: 2995 Comm: syzkaller857118 Not tainted 4.14.0-rc5-next-20171018+ #36 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x41c kernel/panic.c:183 __warn+0x1c4/0x1e0 kernel/panic.c:546 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177 do_trap_no_signal arch/x86/kernel/traps.c:211 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:260 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:tracepoint_add_func kernel/tracepoint.c:210 [inline] RIP: 0010:tracepoint_probe_register_prio+0x397/0x9a0 kernel/tracepoint.c:283 RSP: 0018:ffff8801d1d1f6c0 EFLAGS: 00010293 RAX: ffff8801d22e8540 RBX: 00000000ffffffef RCX: ffffffff81710f07 RDX: 0000000000000000 RSI: ffffffff85b679c0 RDI: ffff8801d5f19818 RBP: ffff8801d1d1f7c8 R08: ffffffff81710c10 R09: 0000000000000004 R10: ffff8801d1d1f6b0 R11: 0000000000000003 R12: ffffffff817597f0 R13: 0000000000000000 R14: 00000000ffffffff R15: ffff8801d1d1f7a0 tracepoint_probe_register+0x2a/0x40 kernel/tracepoint.c:304 register_trace_block_rq_insert include/trace/events/block.h:191 [inline] blk_register_tracepoints+0x1e/0x2f0 kernel/trace/blktrace.c:1043 do_blk_trace_setup+0xa10/0xcf0 kernel/trace/blktrace.c:542 blk_trace_setup+0xbd/0x180 kernel/trace/blktrace.c:564 sg_ioctl+0xc71/0x2d90 drivers/scsi/sg.c:1089 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x444339 RSP: 002b:00007ffe05bb5b18 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000006d66c0 RCX: 0000000000444339 RDX: 000000002084cf90 RSI: 00000000c0481273 RDI: 0000000000000009 RBP: 0000000000000082 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffffff R13: 00000000c0481273 R14: 0000000000000000 R15: 0000000000000000 since we can now run these in parallel. Ensure that the exported helpers for doing this are grabbing the queue trace mutex. Reported-by: Steven Rostedt Tested-by: Dmitry Vyukov Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ea57dd94b2b2..206e0e2ace53 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static atomic_t blk_probes_ref = ATOMIC_INIT(0); +static DEFINE_MUTEX(blk_probe_mutex); +static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); @@ -329,11 +330,26 @@ static void blk_trace_free(struct blk_trace *bt) kfree(bt); } +static void get_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (++blk_probes_ref == 1) + blk_register_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void put_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (!--blk_probes_ref) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + static void blk_trace_cleanup(struct blk_trace *bt) { blk_trace_free(bt); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); + put_probe_ref(); } static int __blk_trace_remove(struct request_queue *q) @@ -549,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); ret = 0; err: @@ -1596,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - + put_probe_ref(); blk_trace_free(bt); return 0; } @@ -1629,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q, if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); return 0; free_bt: -- cgit From e10237cc76ef9a4066a84aa2cc710bfd708cc341 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 7 Nov 2017 11:09:50 -0800 Subject: kthread: zero the kthread data structure kthread() could bail out early before we initialize blkcg_css (if the kthread is killed very early. Please see xchg() statement in kthread()), which confuses free_kthread_struct. Instead of moving the blkcg_css initialization early, we simply zero the whole 'self' data structure, which doesn't sound much overhead. Reported-by: syzbot Fixes: 05e3db95ebfc ("kthread: add a mechanism to store cgroup info") Cc: Andrew Morton Cc: Ingo Molnar Cc: Dmitry Vyukov Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/kthread.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index f87cd8b4eb2a..8dbe2454cb1d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -204,7 +204,7 @@ static int kthread(void *_create) struct kthread *self; int ret; - self = kmalloc(sizeof(*self), GFP_KERNEL); + self = kzalloc(sizeof(*self), GFP_KERNEL); set_kthread_struct(self); /* If user was SIGKILLed, I release the structure. */ @@ -220,13 +220,9 @@ static int kthread(void *_create) do_exit(-ENOMEM); } - self->flags = 0; self->data = data; init_completion(&self->exited); init_completion(&self->parked); -#ifdef CONFIG_BLK_CGROUP - self->blkcg_css = NULL; -#endif current->vfork_done = &self->exited; /* OK, tell user we're spawned, wait for stop or wakeup */ -- cgit From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 7 Nov 2017 15:28:42 -0500 Subject: bpf: add a bpf_override_function helper Error injection is sloppy and very ad-hoc. BPF could fill this niche perfectly with it's kprobe functionality. We could make sure errors are only triggered in specific call chains that we care about with very specific situations. Accomplish this with the bpf_override_funciton helper. This will modify the probe'd callers return value to the specified value and set the PC to an override function that simply returns, bypassing the originally probed function. This gives us a nice clean way to implement systematic error injection for all of our code paths. Acked-by: Alexei Starovoitov Signed-off-by: Josef Bacik Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 +++++++ kernel/trace/Kconfig | 11 +++++++++++ kernel/trace/bpf_trace.c | 35 +++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 40 +++++++++++++++++++++++++++++++++------- kernel/trace/trace_probe.h | 6 ++++++ 7 files changed, 97 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8a6c37762330..271daad31f37 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,6 +1326,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4a942e2e753d..bc464b8ec91e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,6 +4357,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index 42d24bd64ea4..ac240d31b5bf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 434c840e2d82..9dc0deeaad2b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,6 +518,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 506efe6e8ed9..1865b0d4cdeb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} +#else +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + return -EINVAL; +} +#endif + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -551,6 +578,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; + case BPF_FUNC_override_return: + pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", + current->comm, task_pid_nr(current)); + return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -766,6 +797,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* Kprobe override only works for ftrace based kprobes. */ + if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index abf92e478cfb..8e3c9ec1faf7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1177,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1186,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1217,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1300,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1308,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 903273c93e61..adbb3f7d1fb5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,6 +253,7 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Nov 2017 18:24:55 +0900 Subject: bpf: Revert bpf_overrid_function() helper changes. NACK'd by x86 maintainer. Signed-off-by: David S. Miller --- kernel/bpf/core.c | 3 --- kernel/bpf/verifier.c | 2 -- kernel/events/core.c | 7 ------- kernel/trace/Kconfig | 11 ----------- kernel/trace/bpf_trace.c | 35 ----------------------------------- kernel/trace/trace_kprobe.c | 40 +++++++--------------------------------- kernel/trace/trace_probe.h | 6 ------ 7 files changed, 7 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 271daad31f37..8a6c37762330 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1326,9 +1326,6 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { - if (fp->kprobe_override) - return false; - if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bc464b8ec91e..4a942e2e753d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4357,8 +4357,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_override_return) - prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index ac240d31b5bf..42d24bd64ea4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } - /* Kprobe override only works for kprobes, not uprobes. */ - if (prog->kprobe_override && - !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { - bpf_prog_put(prog); - return -EINVAL; - } - if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9dc0deeaad2b..434c840e2d82 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -518,17 +518,6 @@ config FUNCTION_PROFILER If in doubt, say N. -config BPF_KPROBE_OVERRIDE - bool "Enable BPF programs to override a kprobed function" - depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE - depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS - default n - help - Allows BPF to override the execution of a probed function and - set a different return value. This is used for error injection. - config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1865b0d4cdeb..506efe6e8ed9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,10 +13,6 @@ #include #include #include -#include -#include - -#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - __this_cpu_write(bpf_kprobe_override, 1); - regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); - return 0; -} -#else -BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) -{ - return -EINVAL; -} -#endif - -static const struct bpf_func_proto bpf_override_return_proto = { - .func = bpf_override_return, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, -}; - BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -578,10 +551,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_override_return: - pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!", - current->comm, task_pid_nr(current)); - return &bpf_override_return_proto; default: return tracing_func_proto(func_id); } @@ -797,10 +766,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; - /* Kprobe override only works for ftrace based kprobes. */ - if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event)) - return -EINVAL; - mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8e3c9ec1faf7..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,7 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) -{ - struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); -} - static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1177,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static int +static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1186,29 +1179,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call)) { - int ret; - - ret = trace_call_bpf(call, regs); - - /* - * We need to check and see if we modified the pc of the - * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. Also reset our state so - * we are clean the next pass through. - */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); - reset_current_kprobe(); - return 1; - } - if (!ret) - return 0; - } + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) + return; head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return 0; + return; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1217,14 +1193,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return 0; + return; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL, NULL); - return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1300,7 +1275,6 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1308,9 +1282,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - ret = kprobe_perf_func(tk, regs); + kprobe_perf_func(tk, regs); #endif - return ret; + return 0; /* We don't tweek kernel, so just return 0 */ } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index adbb3f7d1fb5..903273c93e61 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -253,7 +253,6 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } - -static inline int trace_kprobe_ftrace(struct trace_event_call *call) -{ - return 0; -} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit From 5e4def20381678ba3ce0a4e117f97e378ecd81bc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Nov 2017 15:27:44 +0000 Subject: Pass mode to wait_on_atomic_t() action funcs and provide default actions Make wait_on_atomic_t() pass the TASK_* mode onto its action function as an extra argument and make it 'unsigned int throughout. Also, consolidate a bunch of identical action functions into a default function that can do the appropriate thing for the mode. Also, change the argument name in the bit_wait*() function declarations to reflect the fact that it's the mode and not the bit number. [Peter Z gives this a grudging ACK, but thinks that the whole atomic_t wait should be done differently, though he's not immediately sure as to how] Signed-off-by: David Howells Acked-by: Peter Zijlstra cc: Ingo Molnar --- kernel/sched/wait_bit.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index f8159698aa4d..84cb3acd9260 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo */ static __sched int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, - int (*action)(atomic_t *), unsigned mode) + wait_atomic_t_action_f action, unsigned int mode) { atomic_t *val; int ret = 0; @@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en val = wbq_entry->key.flags; if (atomic_read(val) == 0) break; - ret = (*action)(val); + ret = (*action)(val, mode); } while (!ret && atomic_read(val) != 0); finish_wait(wq_head, &wbq_entry->wq_entry); return ret; @@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en }, \ } -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, + wait_atomic_t_action_f action, + unsigned int mode) { struct wait_queue_head *wq_head = atomic_t_waitqueue(p); DEFINE_WAIT_ATOMIC_T(wq_entry, p); @@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), } EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); +__sched int atomic_t_wait(atomic_t *counter, unsigned int mode) +{ + schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; + return 0; +} +EXPORT_SYMBOL(atomic_t_wait); + /** * wake_up_atomic_t - Wake up a waiter on a atomic_t * @p: The atomic_t being waited on, a kernel virtual address -- cgit From 9fd29c08e52023252f0480ab8f6906a1ecc9a8d5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 12 Nov 2017 14:49:09 -0800 Subject: bpf: improve verifier ARG_CONST_SIZE_OR_ZERO semantics For helpers, the argument type ARG_CONST_SIZE_OR_ZERO permits the access size to be 0 when accessing the previous argument (arg). Right now, it requires the arg needs to be NULL when size passed is 0 or could be 0. It also requires a non-NULL arg when the size is proved to be non-0. This patch changes verifier ARG_CONST_SIZE_OR_ZERO behavior such that for size-0 or possible size-0, it is not required the arg equal to NULL. There are a couple of reasons for this semantics change, and all of them intends to simplify user bpf programs which may improve user experience and/or increase chances of verifier acceptance. Together with the next patch which changes bpf_probe_read arg2 type from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO, the following two examples, which fail the verifier currently, are able to get verifier acceptance. Example 1: unsigned long len = pend - pstart; len = len > MAX_PAYLOAD_LEN ? MAX_PAYLOAD_LEN : len; len &= MAX_PAYLOAD_LEN; bpf_probe_read(data->payload, len, pstart); It does not have test for "len > 0" and it failed the verifier. Users may not be aware that they have to add this test. Converting the bpf_probe_read helper to have ARG_CONST_SIZE_OR_ZERO helps the above code get verifier acceptance. Example 2: Here is one example where llvm "messed up" the code and the verifier fails. ...... unsigned long len = pend - pstart; if (len > 0 && len <= MAX_PAYLOAD_LEN) bpf_probe_read(data->payload, len, pstart); ...... The compiler generates the following code and verifier fails: ...... 39: (79) r2 = *(u64 *)(r10 -16) 40: (1f) r2 -= r8 41: (bf) r1 = r2 42: (07) r1 += -1 43: (25) if r1 > 0xffe goto pc+3 R0=inv(id=0) R1=inv(id=0,umax_value=4094,var_off=(0x0; 0xfff)) R2=inv(id=0) R6=map_value(id=0,off=0,ks=4,vs=4095,imm=0) R7=inv(id=0) R8=inv(id=0) R9=inv0 R10=fp0 44: (bf) r1 = r6 45: (bf) r3 = r8 46: (85) call bpf_probe_read#45 R2 min value is negative, either use unsigned or 'var &= const' ...... The compiler optimization is correct. If r1 = 0, r1 - 1 = 0xffffffffffffffff > 0xffe. If r1 != 0, r1 - 1 will not wrap. r1 > 0xffe at insn #43 can actually capture both "r1 > 0" and "len <= MAX_PAYLOAD_LEN". This however causes an issue in verifier as the value range of arg2 "r2" does not properly get refined and lead to verification failure. Relaxing bpf_prog_read arg2 from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO allows the following simplied code: unsigned long len = pend - pstart; if (len <= MAX_PAYLOAD_LEN) bpf_probe_read(data->payload, len, pstart); The llvm compiler will generate less complex code and the verifier is able to verify that the program is okay. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4a942e2e753d..dd54d20ace2f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -799,12 +799,13 @@ static int check_stack_read(struct bpf_verifier_env *env, /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_map *map = regs[regno].map_ptr; - if (off < 0 || size <= 0 || off + size > map->value_size) { + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + off + size > map->value_size) { verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; @@ -814,7 +815,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; @@ -837,7 +838,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->smin_value + off, size); + err = __check_map_access(env, regno, reg->smin_value + off, size, + zero_size_allowed); if (err) { verbose(env, "R%d min value is outside of the array range\n", regno); @@ -853,7 +855,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, regno); return -EACCES; } - err = __check_map_access(env, regno, reg->umax_value + off, size); + err = __check_map_access(env, regno, reg->umax_value + off, size, + zero_size_allowed); if (err) verbose(env, "R%d max value is outside of the array range\n", regno); @@ -889,12 +892,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, } static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; - if (off < 0 || size <= 0 || (u64)off + size > reg->range) { + if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || + (u64)off + size > reg->range) { verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; @@ -903,7 +907,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, } static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, - int size) + int size, bool zero_size_allowed) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; @@ -922,7 +926,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_packet_access(env, regno, off, size); + err = __check_packet_access(env, regno, off, size, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); return err; @@ -1097,7 +1101,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_map_access(env, regno, off, size); + err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -1184,7 +1188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); return -EACCES; } - err = check_packet_access(env, regno, off, size); + err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { @@ -1281,7 +1285,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size <= 0) { + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; @@ -1319,9 +1323,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (reg->type) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, reg->off, access_size); + return check_packet_access(env, regno, reg->off, access_size, + zero_size_allowed); case PTR_TO_MAP_VALUE: - return check_map_access(env, regno, reg->off, access_size); + return check_map_access(env, regno, reg->off, access_size, + zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ return check_stack_boundary(env, regno, access_size, zero_size_allowed, meta); @@ -1415,7 +1421,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->key_size); + meta->map_ptr->key_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->key_size, @@ -1431,7 +1438,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (type_is_pkt_pointer(type)) err = check_packet_access(env, regno, reg->off, - meta->map_ptr->value_size); + meta->map_ptr->value_size, + false); else err = check_stack_boundary(env, regno, meta->map_ptr->value_size, -- cgit From 9c019e2bc4b2bd8223c8c0d4b6962478b479834d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 12 Nov 2017 14:49:10 -0800 Subject: bpf: change helper bpf_probe_read arg2 type to ARG_CONST_SIZE_OR_ZERO The helper bpf_probe_read arg2 type is changed from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO to permit size-0 buffer. Together with newer ARG_CONST_SIZE_OR_ZERO semantics which allows non-NULL buffer with size 0, this allows simpler bpf programs with verifier acceptance. The previous commit which changes ARG_CONST_SIZE_OR_ZERO semantics has details on examples. Signed-off-by: Yonghong Song Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 506efe6e8ed9..a5580c670866 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -78,12 +78,16 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { - int ret; + int ret = 0; + + if (unlikely(size == 0)) + goto out; ret = probe_kernel_read(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); + out: return ret; } @@ -92,7 +96,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; -- cgit From 89ad2fa3f043a1e8daae193bcb5fe34d5f8caf28 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Nov 2017 17:15:50 -0800 Subject: bpf: fix lockdep splat pcpu_freelist_pop() needs the same lockdep awareness than pcpu_freelist_populate() to avoid a false positive. [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ] switchto-defaul/12508 [HC0[0]:SC0[6]:HE0:SE0] is trying to acquire: (&htab->buckets[i].lock){......}, at: [] __htab_percpu_map_update_elem+0x1cb/0x300 and this task is already holding: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...}, at: [] __dev_queue_xmit+0 x868/0x1240 which would create a new lock dependency: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} -> (&htab->buckets[i].lock){......} but this new dependency connects a SOFTIRQ-irq-safe lock: (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} ... which became SOFTIRQ-irq-safe at: [] __lock_acquire+0x42b/0x1f10 [] lock_acquire+0xbc/0x1b0 [] _raw_spin_lock+0x38/0x50 [] __dev_queue_xmit+0x868/0x1240 [] dev_queue_xmit+0x10/0x20 [] ip_finish_output2+0x439/0x590 [] ip_finish_output+0x150/0x2f0 [] ip_output+0x7d/0x260 [] ip_local_out+0x5e/0xe0 [] ip_queue_xmit+0x205/0x620 [] tcp_transmit_skb+0x5a8/0xcb0 [] tcp_write_xmit+0x242/0x1070 [] __tcp_push_pending_frames+0x3c/0xf0 [] tcp_rcv_established+0x312/0x700 [] tcp_v4_do_rcv+0x11c/0x200 [] tcp_v4_rcv+0xaa2/0xc30 [] ip_local_deliver_finish+0xa7/0x240 [] ip_local_deliver+0x66/0x200 [] ip_rcv_finish+0xdd/0x560 [] ip_rcv+0x295/0x510 [] __netif_receive_skb_core+0x988/0x1020 [] __netif_receive_skb+0x21/0x70 [] process_backlog+0x6f/0x230 [] net_rx_action+0x229/0x420 [] __do_softirq+0xd8/0x43d [] do_softirq_own_stack+0x1c/0x30 [] do_softirq+0x55/0x60 [] __local_bh_enable_ip+0xa8/0xb0 [] cpu_startup_entry+0x1c7/0x500 [] start_secondary+0x113/0x140 to a SOFTIRQ-irq-unsafe lock: (&head->lock){+.+...} ... which became SOFTIRQ-irq-unsafe at: ... [] __lock_acquire+0x82f/0x1f10 [] lock_acquire+0xbc/0x1b0 [] _raw_spin_lock+0x38/0x50 [] pcpu_freelist_pop+0x7a/0xb0 [] htab_map_alloc+0x50c/0x5f0 [] SyS_bpf+0x265/0x1200 [] entry_SYSCALL_64_fastpath+0x12/0x17 other info that might help us debug this: Chain exists of: dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2 --> &htab->buckets[i].lock --> &head->lock Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&head->lock); local_irq_disable(); lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2); lock(&htab->buckets[i].lock); lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2); *** DEADLOCK *** Fixes: e19494edab82 ("bpf: introduce percpu_freelist") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- kernel/bpf/percpu_freelist.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 5c51d1985b51..673fa6fe2d73 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; + unsigned long flags; int orig_cpu, cpu; + local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock(&head->lock); + raw_spin_unlock_irqrestore(&head->lock, flags); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) + if (cpu == orig_cpu) { + local_irq_restore(flags); return NULL; + } } } -- cgit From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:33 -0800 Subject: mm: account pud page tables On a machine with 5-level paging support a process can allocate significant amount of memory and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PUD page tables. We don't account PUD page tables, only PMD and PTE. We already addressed the same issue for PMD page tables, see commit dc6c9a35b66b ("mm: account pmd page tables to the process"). Introduction of 5-level paging brings the same issue for PUD page tables. The patch expands accounting to PUD level. [kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/] Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com [heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting] Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..a4eb6f289365 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); mm_nr_pmds_init(mm); + mm_nr_puds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm) if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); + if (mm_nr_puds(mm)) + pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", + mm_nr_puds(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -- cgit From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd and nr_pud. The patch also makes nr_ptes accounting dependent onto CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. It's preparation for consolidation of page-table counters in mm_struct. Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a4eb6f289365..946922a30ede 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); + mm_nr_ptes_init(mm); mm_nr_pmds_init(mm); mm_nr_puds_init(mm); mm->map_count = 0; @@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) + if (mm_nr_ptes(mm)) pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); + mm_nr_ptes(mm)); if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); -- cgit From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of total memory allocated to page tables for oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to single counter. mm->pgtables_bytes is now used to account all page table levels. We use bytes, because page table size for different levels of page table tree may be different. The change has user-visible effect: we don't have VmPMD and VmPUD reported in /proc/[pid]/status. Not sure if anybody uses them. (As alternative, we can always report 0 kB for them.) OOM-killer report is also slightly changed: we now report pgtables_bytes instead of nr_ptes, nr_pmd, nr_puds. Apart from reducing number of counters per-mm, the benefit is that we now calculate oom_badness() more correctly for machines which have different size of page tables depending on level or where page tables are less than a page in size. The only downside can be debuggability because we do not know which page table level could leak. But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 946922a30ede..006dc5899a1a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - mm_nr_ptes_init(mm); - mm_nr_pmds_init(mm); - mm_nr_puds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (mm_nr_ptes(mm)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - mm_nr_ptes(mm)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); - if (mm_nr_puds(mm)) - pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", - mm_nr_puds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -- cgit From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:51 -0800 Subject: kmemcheck: remove annotations Patch series "kmemcheck: kill kmemcheck", v2. As discussed at LSF/MM, kill kmemcheck. KASan is a replacement that is able to work without the limitation of kmemcheck (single CPU, slow). KASan is already upstream. We are also not aware of any users of kmemcheck (or users who don't consider KASan as a suitable replacement). The only objection was that since KASAN wasn't supported by all GCC versions provided by distros at that time we should hold off for 2 years, and try again. Now that 2 years have passed, and all distros provide gcc that supports KASAN, kill kmemcheck again for the very same reasons. This patch (of 4): Remove kmemcheck annotations, and calls to kmemcheck from the kernel. [alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs] Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bpf/core.c | 6 ------ kernel/locking/lockdep.c | 3 --- kernel/trace/ring_buffer.c | 3 --- 3 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7b62df86be1d..11ad089f2c74 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - kmemcheck_annotate_bitfield(fp, meta); - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { - kmemcheck_annotate_bitfield(fp, meta); - memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; @@ -662,8 +658,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); if (fp != NULL) { - kmemcheck_annotate_bitfield(fp, meta); - /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index db933d063bfc..9776da8db180 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, { int i; - kmemcheck_mark_initialized(lock, sizeof(*lock)); - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) lock->class_cache[i] = NULL; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 845f3805c73d..d57fede84b38 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -13,7 +13,6 @@ #include #include #include /* for self test */ -#include #include #include #include @@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); /* account for padding bytes */ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); @@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); -- cgit From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:54 -0800 Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK Convert all allocations that used a NOTRACK flag to stop using it. Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++++++------ kernel/signal.c | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 006dc5899a1a..4e55eedba8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -469,7 +469,7 @@ void __init fork_init(void) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -2205,18 +2205,18 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); + SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the @@ -2227,7 +2227,7 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..aa1fb9f905db 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); + q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { -- cgit From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:36:02 -0800 Subject: kmemcheck: rip it out Fix up makefiles, remove references, and git rm kmemcheck. Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Steven Rostedt Cc: Vegard Nossum Cc: Pekka Enberg Cc: Michal Hocko Cc: Eric W. Biederman Cc: Alexander Potapenko Cc: Tim Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 10 ---------- kernel/sysctl.c | 10 ---------- 2 files changed, 20 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 662f7b1b7a78..2f5e87f1bae2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); -void __tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - lockdep_assert_irqs_disabled(); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); -} -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9576bd582d4a..7638e2f7fff8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1173,15 +1172,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_thousand, }, -#endif -#ifdef CONFIG_KMEMCHECK - { - .procname = "kmemcheck", - .data = &kmemcheck_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { .procname = "panic_on_warn", -- cgit From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:03 -0800 Subject: mm: remove __GFP_COLD As the page free path makes no distinction between cache hot and cold pages, there is no real useful ordering of pages in the free list that allocation requests can take advantage of. Juding from the users of __GFP_COLD, it is likely that a number of them are the result of copying other sites instead of actually measuring the impact. Remove the __GFP_COLD parameter which simplifies a number of paths in the page allocator. This is potentially controversial but bear in mind that the size of the per-cpu pagelists versus modern cache sizes means that the whole per-cpu list can often fit in the L3 cache. Hence, there is only a potential benefit for microbenchmarks that alloc/free pages in a tight loop. It's even worse when THP is taken into account which has little or no chance of getting a cache-hot page as the per-cpu list is bypassed and the zeroing of multiple pages will thrash the cache anyway. The truncate microbenchmarks are not shown as this patch affects the allocation path and not the free path. A page fault microbenchmark was tested but it showed no sigificant difference which is not surprising given that the __GFP_COLD branches are a miniscule percentage of the fault path. Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a917a301e201..bce0464524d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) */ static inline int get_highmem_buffer(int safe_needed) { - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + buffer = get_image_page(GFP_ATOMIC, safe_needed); return buffer ? 0 : -ENOMEM; } @@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm, while (nr_pages-- > 0) { struct page *page; - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_image_page(GFP_ATOMIC); if (!page) goto err_out; memory_bm_set_bit(copy_bm, page_to_pfn(page)); -- cgit From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Wed, 15 Nov 2017 17:38:22 -0800 Subject: mm, sysctl: make NUMA stats configurable This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. ========================================================================= When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer (increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench ========================================================================= When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. [keescook@chromium.org: make sure mutex is a global static] Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Signed-off-by: Kees Cook Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Luis R . Rodriguez" Cc: Kees Cook Cc: Jonathan Corbet Cc: Mel Gorman Cc: Johannes Weiner Cc: Christopher Lameter Cc: Sebastian Andrzej Siewior Cc: Andrey Ryabinin Cc: Tim Chen Cc: Andi Kleen Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7638e2f7fff8..4a13a389e99b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1356,6 +1356,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "hugetlb_shm_group", -- cgit From b1fca27d384e8418aac84b39f6f5179aecc1b64f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:03 -0800 Subject: kernel debug: support resetting WARN*_ONCE I like _ONCE warnings because it's guaranteed that they don't flood the log. During testing I find it useful to reset the state of the once warnings, so that I can rerun tests and see if they trigger again, or can guarantee that a test run always hits the same warnings. This patch adds a debugfs interface to reset all the _ONCE warnings so that they appear again: echo 1 > /sys/kernel/debug/clear_warn_once This is implemented by putting all the warning booleans into a special section, and clearing it. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20171017221455.6740-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index bdd18afa19a4..672a91dc20fe 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -587,6 +589,32 @@ void warn_slowpath_null(const char *file, int line) EXPORT_SYMBOL(warn_slowpath_null); #endif +#ifdef CONFIG_BUG + +/* Support resetting WARN*_ONCE state */ + +static int clear_warn_once_set(void *data, u64 val) +{ + memset(__start_once, 0, __end_once - __start_once); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, + NULL, + clear_warn_once_set, + "%lld\n"); + +static __init int register_warn_debugfs(void) +{ + /* Don't care about failure */ + debugfs_create_file("clear_warn_once", 0644, NULL, + NULL, &clear_warn_once_fops); + return 0; +} + +device_initcall(register_warn_debugfs); +#endif + #ifdef CONFIG_CC_STACKPROTECTOR /* -- cgit From aaf5dcfb223617ac2d16113e4b500199c65689de Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 17 Nov 2017 15:27:06 -0800 Subject: kernel debug: support resetting WARN_ONCE for all architectures Some architectures store the WARN_ONCE state in the flags field of the bug_entry. Clear that one too when resetting once state through /sys/kernel/debug/clear_warn_once Pointed out by Michael Ellerman Improves the earlier patch that add clear_warn_once. [ak@linux.intel.com: add a missing ifdef CONFIG_MODULES] Link: http://lkml.kernel.org/r/20171020170633.9593-1-andi@firstfloor.org [akpm@linux-foundation.org: fix unused var warning] [akpm@linux-foundation.org: Use 0200 for clear_warn_once file, per mpe] [akpm@linux-foundation.org: clear BUGFLAG_DONE in clear_once_table(), per mpe] Link: http://lkml.kernel.org/r/20171019204642.7404-1-andi@firstfloor.org Signed-off-by: Andi Kleen Tested-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 672a91dc20fe..67cebf2a3b67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -595,6 +595,7 @@ EXPORT_SYMBOL(warn_slowpath_null); static int clear_warn_once_set(void *data, u64 val) { + generic_bug_clear_once(); memset(__start_once, 0, __end_once - __start_once); return 0; } @@ -607,7 +608,7 @@ DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0644, NULL, + debugfs_create_file("clear_warn_once", 0200, NULL, NULL, &clear_warn_once_fops); return 0; } -- cgit From 2a8358d8a339540f00ec596526690e8eeca931a3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:21 -0800 Subject: bug: define the "cut here" string in a single place The "cut here" string is used in a few paths. Define it in a single place. Link: http://lkml.kernel.org/r/1510100869-73751-3-git-send-email-keescook@chromium.org Signed-off-by: Kees Cook Cc: Arnd Bergmann Cc: Fengguang Wu Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 67cebf2a3b67..89df5fa2f798 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn("------------[ cut here ]------------\n"); + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", -- cgit From a7bed27af194aa3f67915688039d93188ed95e2a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 17 Nov 2017 15:27:24 -0800 Subject: bug: fix "cut here" location for __WARN_TAINT architectures Prior to v4.11, x86 used warn_slowpath_fmt() for handling WARN()s. After WARN() was moved to using UD0 on x86, the warning text started appearing _before_ the "cut here" line. This appears to have been a long-standing bug on architectures that used __WARN_TAINT, but it didn't get fixed. v4.11 and earlier on x86: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 2956 at drivers/misc/lkdtm_bugs.c:65 lkdtm_WARNING+0x21/0x30 This is a warning message Modules linked in: v4.12 and later on x86: This is a warning message ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2982 at drivers/misc/lkdtm_bugs.c:68 lkdtm_WARNING+0x15/0x20 Modules linked in: With this fix: ------------[ cut here ]------------ This is a warning message WARNING: CPU: 3 PID: 3009 at drivers/misc/lkdtm_bugs.c:67 lkdtm_WARNING+0x15/0x20 Since the __FILE__ reporting happens as part of the UD0 handler, it isn't trivial to move the message to after the WARNING line, but at least we can fix the position of the "cut here" line so all the various logging tools will start including the actual runtime warning message again, when they follow the instruction and "cut here". Link: http://lkml.kernel.org/r/1510100869-73751-4-git-send-email-keescook@chromium.org Fixes: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0") Signed-off-by: Kees Cook Cc: Peter Zijlstra (Intel) Cc: Josh Poimboeuf Cc: Fengguang Wu Cc: Arnd Bergmann Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 89df5fa2f798..3242b64b1956 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -520,7 +520,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, { disable_trace_on_warning(); - pr_warn(CUT_HERE); + if (args) + pr_warn(CUT_HERE); if (file) pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", @@ -584,9 +585,22 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { + pr_warn(CUT_HERE); __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); +#else +void __warn_printk(const char *fmt, ...) +{ + va_list args; + + pr_warn(CUT_HERE); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); +} +EXPORT_SYMBOL(__warn_printk); #endif #ifdef CONFIG_BUG -- cgit From 8c703d660450c4df72ff24f79a335dc7973a9dc8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Nov 2017 15:27:32 -0800 Subject: kernel/umh.c: optimize 'proc_cap_handler()' If 'write' is 0, we can avoid a call to spin_lock/spin_unlock. Link: http://lkml.kernel.org/r/20171020193331.7233-1-christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Luis R. Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/umh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/umh.c b/kernel/umh.c index 6ff9905250ff..18e5fa4b0e71 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write, /* * Drop everything not in the new_cap (but don't add things) */ - spin_lock(&umh_sysctl_lock); if (write) { + spin_lock(&umh_sysctl_lock); if (table->data == CAP_BSET) usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); if (table->data == CAP_PI) usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + spin_unlock(&umh_sysctl_lock); } - spin_unlock(&umh_sysctl_lock); return 0; } -- cgit From 98159d977f71c3b3dee898d1c34e56f520b094e7 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:17 -0800 Subject: pipe: match pipe_max_size data type with procfs Patch series "A few round_pipe_size() and pipe-max-size fixups", v3. While backporting Michael's "pipe: fix limit handling" patchset to a distro-kernel, Mikulas noticed that current upstream pipe limit handling contains a few problems: 1 - procfs signed wrap: echo'ing a large number into /proc/sys/fs/pipe-max-size and then cat'ing it back out shows a negative value. 2 - round_pipe_size() nr_pages overflow on 32bit: this would subsequently try roundup_pow_of_two(0), which is undefined. 3 - visible non-rounded pipe-max-size value: there is no mutual exclusion or protection between the time pipe_max_size is assigned a raw value from proc_dointvec_minmax() and when it is rounded. 4 - unsigned long -> unsigned int conversion makes for potential odd return errors from do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv(). This version underwent the same testing as v1: https://marc.info/?l=linux-kernel&m=150643571406022&w=2 This patch (of 4): pipe_max_size is defined as an unsigned int: unsigned int pipe_max_size = 1048576; but its procfs/sysctl representation is an integer: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... that is signed: int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) This leads to signed results via procfs for large values of pipe_max_size: % echo 2147483647 >/proc/sys/fs/pipe-max-size % cat /proc/sys/fs/pipe-max-size -2147483648 Use unsigned operations on this variable to avoid such negative values. Link: http://lkml.kernel.org/r/1507658689-11669-2-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Al Viro Cc: Jens Axboe Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4a13a389e99b..2d42183b4c98 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1816,7 +1816,7 @@ static struct ctl_table fs_table[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, - .maxlen = sizeof(int), + .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, -- cgit From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:24 -0800 Subject: pipe: add proc_dopipe_max_size() to safely assign pipe_max_size pipe_max_size is assigned directly via procfs sysctl: static struct ctl_table fs_table[] = { ... { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, ... int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { ... ret = proc_dointvec_minmax(table, write, buf, lenp, ppos) ... and then later rounded in-place a few statements later: ... pipe_max_size = round_pipe_size(pipe_max_size); ... This leaves a window of time between initial assignment and rounding that may be visible to other threads. (For example, one thread sets a non-rounded value to pipe_max_size while another reads its value.) Similar reads of pipe_max_size are potentially racy: pipe.c :: alloc_pipe_info() pipe.c :: pipe_set_size() Add a new proc_dopipe_max_size() that consolidates reading the new value from the user buffer, verifying bounds, and calling round_pipe_size() with a single assignment to pipe_max_size. Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2d42183b4c98..138b6484f277 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, ¶m); } +struct do_proc_dopipe_max_size_conv_param { + unsigned int *min; +}; + +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + struct do_proc_dopipe_max_size_conv_param *param = data; + + if (write) { + unsigned int val = round_pipe_size(*lvalp); + + if (val == 0) + return -EINVAL; + + if (param->min && *param->min > val) + return -ERANGE; + + if (*lvalp > UINT_MAX) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dopipe_max_size_conv_param param = { + .min = (unsigned int *) table->extra1, + }; + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, ¶m); +} + static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP @@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); +EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); -- cgit From fb910c42ccebf853c29296185c45c11164a56098 Mon Sep 17 00:00:00 2001 From: Joe Lawrence Date: Fri, 17 Nov 2017 15:29:28 -0800 Subject: sysctl: check for UINT_MAX before unsigned int min/max Mikulas noticed in the existing do_proc_douintvec_minmax_conv() and do_proc_dopipe_max_size_conv() introduced in this patchset, that they inconsistently handle overflow and min/max range inputs: For example: 0 ... param->min - 1 ---> ERANGE param->min ... param->max ---> the value is accepted param->max + 1 ... 0x100000000L + param->min - 1 ---> ERANGE 0x100000000L + param->min ... 0x100000000L + param->max ---> EINVAL 0x100000000L + param->max + 1, 0x200000000L + param->min - 1 ---> ERANGE 0x200000000L + param->min ... 0x200000000L + param->max ---> EINVAL 0x200000000L + param->max + 1, 0x300000000L + param->min - 1 ---> ERANGE In do_proc_do*() routines which store values into unsigned int variables (4 bytes wide for 64-bit builds), first validate that the input unsigned long value (8 bytes wide for 64-bit builds) will fit inside the smaller unsigned int variable. Then check that the unsigned int value falls inside the specified parameter min, max range. Otherwise the unsigned long -> unsigned int conversion drops leading bits from the input value, leading to the inconsistent pattern Mikulas documented above. Link: http://lkml.kernel.org/r/1507658689-11669-5-git-send-email-joe.lawrence@redhat.com Signed-off-by: Joe Lawrence Reported-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Cc: Al Viro Cc: Jens Axboe Cc: Michael Kerrisk Cc: Randy Dunlap Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 138b6484f277..dd25d90896fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2576,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, if (write) { unsigned int val = *lvalp; + if (*lvalp > UINT_MAX) + return -EINVAL; + if ((param->min && *param->min > val) || (param->max && *param->max < val)) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; *valp = val; } else { unsigned int val = *valp; @@ -2632,17 +2633,18 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, struct do_proc_dopipe_max_size_conv_param *param = data; if (write) { - unsigned int val = round_pipe_size(*lvalp); + unsigned int val; + if (*lvalp > UINT_MAX) + return -EINVAL; + + val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; if (param->min && *param->min > val) return -ERANGE; - if (*lvalp > UINT_MAX) - return -EINVAL; - *valp = val; } else { unsigned int val = *valp; -- cgit From 628c1bcba204052d19b686b5bac149a644cdb72e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:01 -0800 Subject: kernel/signal.c: protect the traced SIGNAL_UNKILLABLE tasks from SIGKILL The comment in sig_ignored() says "Tracers may want to know about even ignored signals" but SIGKILL can not be reported to debugger and it is just wrong to return 0 in this case: SIGKILL should only kill the SIGNAL_UNKILLABLE task if it comes from the parent ns. Change sig_ignored() to ignore ->ptrace if sig == SIGKILL and rely on sig_task_ignored(). SISGTOP coming from within the namespace is not really right too but at least debugger can intercept it, and we can't drop it here because this will break "gdb -p 1": ptrace_attach() won't work. Perhaps we will add another ->ptrace check later, we will see. Link: http://lkml.kernel.org/r/20171103184206.GB21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index aa1fb9f905db..be5913134742 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, force)) - return 0; - /* - * Tracers may want to know about even ignored signals. + * Tracers may want to know about even ignored signal unless it + * is SIGKILL which can't be reported anyway but can be ignored + * by SIGNAL_UNKILLABLE task. */ - return !t->ptrace; + if (t->ptrace && sig != SIGKILL) + return 0; + + return sig_task_ignored(t, sig, force); } /* -- cgit From ac25385089f673560867eb5179228a44ade0cfc1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:04 -0800 Subject: kernel/signal.c: protect the SIGNAL_UNKILLABLE tasks from !sig_kernel_only() signals Change sig_task_ignored() to drop the SIG_DFL && !sig_kernel_only() signals even if force == T. This simplifies the next change and this matches the same check in get_signal() which will drop these signals anyway. Link: http://lkml.kernel.org/r/20171103184227.GC21036@redhat.com Signed-off-by: Oleg Nesterov Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index be5913134742..01ba166a5e3a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !force) + handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; return sig_handler_ignored(handler, sig); -- cgit From 426915796ccaf9c2bd9bb06dc5702225957bc2e5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 17 Nov 2017 15:30:08 -0800 Subject: kernel/signal.c: remove the no longer needed SIGNAL_UNKILLABLE check in complete_signal() complete_signal() checks SIGNAL_UNKILLABLE before it starts to destroy the thread group, today this is wrong in many ways. If nothing else, fatal_signal_pending() should always imply that the whole thread group (except ->group_exit_task if it is not NULL) is killed, this check breaks the rule. After the previous changes we can rely on sig_task_ignored(); sig_fatal(sig) && SIGNAL_UNKILLABLE can only be true if we actually want to kill this task and sig == SIGKILL OR it is traced and debugger can intercept the signal. This should hopefully fix the problem reported by Dmitry. This test-case static int init(void *arg) { for (;;) pause(); } int main(void) { char stack[16 * 1024]; for (;;) { int pid = clone(init, stack + sizeof(stack)/2, CLONE_NEWPID | SIGCHLD, NULL); assert(pid > 0); assert(ptrace(PTRACE_ATTACH, pid, 0, 0) == 0); assert(waitpid(-1, NULL, WSTOPPED) == pid); assert(ptrace(PTRACE_DETACH, pid, 0, SIGSTOP) == 0); assert(syscall(__NR_tkill, pid, SIGKILL) == 0); assert(pid == wait(NULL)); } } triggers the WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)) in task_participate_group_stop(). do_signal_stop()->signal_group_exit() checks SIGNAL_GROUP_EXIT and return false, but task_set_jobctl_pending() checks fatal_signal_pending() and does not set JOBCTL_STOP_PENDING. And his should fix the minor security problem reported by Kyle, SECCOMP_RET_TRACE can miss fatal_signal_pending() the same way if the task is the root of a pid namespace. Link: http://lkml.kernel.org/r/20171103184246.GD21036@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Reported-by: Kyle Huey Reviewed-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 01ba166a5e3a..6895f6bb98a7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -931,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && + !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !t->ptrace)) { + (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ -- cgit From de40ccefd1f19180c0a43e4d9b9d2f4dc8856c8b Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 17 Nov 2017 15:30:12 -0800 Subject: kdump: print a message in case parse_crashkernel_mem resulted in zero bytes parse_crashkernel_mem() silently returns if we get zero bytes in the parsing function. It is useful for debugging to add a message, especially if the kernel cannot boot correctly. Add a pr_info instead of pr_warn because it is expected behavior for size = 0, eg. crashkernel=2G-4G:128M, size will be 0 in case system memory is less than 2G. Link: http://lkml.kernel.org/r/20171114080129.GA6115@dhcp-128-65.nay.redhat.com Signed-off-by: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Bhupesh Sharma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 6db80fc0810b..b3663896278e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline, return -EINVAL; } } - } + } else + pr_info("crashkernel size resulted in zero bytes\n"); return 0; } -- cgit From f9eb2fdd04d4e68fbea18970bbf65ace716d25b6 Mon Sep 17 00:00:00 2001 From: "Ola N. Kaldestad" Date: Fri, 17 Nov 2017 15:30:26 -0800 Subject: kernel/sysctl.c: code cleanups Remove unnecessary else block, remove redundant return and call to kfree in if block. Link: http://lkml.kernel.org/r/1510238435-1655-1-git-send-email-mail@okal.no Signed-off-by: Ola N. Kaldestad Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dd25d90896fc..557d46728577 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3127,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, else bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } - kfree(tmp_bitmap); *lenp -= left; *ppos += *lenp; - return 0; - } else { - kfree(tmp_bitmap); - return err; } + + kfree(tmp_bitmap); + return err; } #else /* CONFIG_PROC_SYSCTL */ -- cgit From 95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:30 -0800 Subject: pid: replace pid bitmap implementation with IDR API Patch series "Replacing PID bitmap implementation with IDR API", v4. This series replaces kernel bitmap implementation of PID allocation with IDR API. These patches are written to simplify the kernel by replacing custom code with calls to generic code. The following are the stats for pid and pid_namespace object files before and after the replacement. There is a noteworthy change between the IDR and bitmap implementation. Before text data bss dec hex filename 8447 3894 64 12405 3075 kernel/pid.o After text data bss dec hex filename 3397 304 0 3701 e75 kernel/pid.o Before text data bss dec hex filename 5692 1842 192 7726 1e2e kernel/pid_namespace.o After text data bss dec hex filename 2854 216 16 3086 c0e kernel/pid_namespace.o The following are the stats for ps, pstree and calling readdir on /proc for 10,000 processes. ps: With IDR API With bitmap real 0m1.479s 0m2.319s user 0m0.070s 0m0.060s sys 0m0.289s 0m0.516s pstree: With IDR API With bitmap real 0m1.024s 0m1.794s user 0m0.348s 0m0.612s sys 0m0.184s 0m0.264s proc: With IDR API With bitmap real 0m0.059s 0m0.074s user 0m0.000s 0m0.004s sys 0m0.016s 0m0.016s This patch (of 2): Replace the current bitmap implementation for Process ID allocation. Functions that are no longer required, for example, free_pidmap(), alloc_pidmap(), etc. are removed. The rest of the functions are modified to use the IDR API. The change was made to make the PID allocation less complex by replacing custom code with calls to generic API. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com [avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl] Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Acked-by: Oleg Nesterov Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Eric W. Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 201 +++++++++---------------------------------------- kernel/pid_namespace.c | 53 ++++++------- 2 files changed, 59 insertions(+), 195 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 020dedbdf066..0ce59369632f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -39,6 +39,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -static inline int mk_pid(struct pid_namespace *pid_ns, - struct pidmap *map, int off) -{ - return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; -} - -#define find_next_offset(map, off) \ - find_next_zero_bit((map)->page, BITS_PER_PAGE, off) /* * PID-map pages start out as NULL, they get allocated upon @@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns, */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), - .pidmap = { - [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } - }, - .last_pid = 0, + .idr = IDR_INIT, .nr_hashed = PIDNS_HASH_ADDING, .level = 0, .child_reaper = &init_task, @@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct upid *upid) -{ - int nr = upid->nr; - struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; - int offset = nr & BITS_PER_PAGE_MASK; - - clear_bit(offset, map->page); - atomic_inc(&map->nr_free); -} - -/* - * If we started walking pids at 'base', is 'a' seen before 'b'? - */ -static int pid_before(int base, int a, int b) -{ - /* - * This is the same as saying - * - * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT - * and that mapping orders 'a' and 'b' with respect to 'base'. - */ - return (unsigned)(a - base) < (unsigned)(b - base); -} - -/* - * We might be racing with someone else trying to set pid_ns->last_pid - * at the pid allocation time (there's also a sysctl for this, but racing - * with this one is OK, see comment in kernel/pid_namespace.c about it). - * We want the winner to have the "later" value, because if the - * "earlier" value prevails, then a pid may get reused immediately. - * - * Since pids rollover, it is not sufficient to just pick the bigger - * value. We have to consider where we started counting from. - * - * 'base' is the value of pid_ns->last_pid that we observed when - * we started looking for a pid. - * - * 'pid' is the pid that we eventually found. - */ -static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) -{ - int prev; - int last_write = base; - do { - prev = last_write; - last_write = cmpxchg(&pid_ns->last_pid, prev, pid); - } while ((prev != last_write) && (pid_before(base, last_write, pid))); -} - -static int alloc_pidmap(struct pid_namespace *pid_ns) -{ - int i, offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; - offset = pid & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - /* - * If last_pid points into the middle of the map->page we - * want to scan this bitmap block twice, the second time - * we start with offset == 0 (or RESERVED_PIDS). - */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { - if (unlikely(!map->page)) { - void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* - * Free the page if someone raced with us - * installing it: - */ - spin_lock_irq(&pidmap_lock); - if (!map->page) { - map->page = page; - page = NULL; - } - spin_unlock_irq(&pidmap_lock); - kfree(page); - if (unlikely(!map->page)) - return -ENOMEM; - } - if (likely(atomic_read(&map->nr_free))) { - for ( ; ; ) { - if (!test_and_set_bit(offset, map->page)) { - atomic_dec(&map->nr_free); - set_last_pid(pid_ns, last, pid); - return pid; - } - offset = find_next_offset(map, offset); - if (offset >= BITS_PER_PAGE) - break; - pid = mk_pid(pid_ns, map, offset); - if (pid >= pid_max) - break; - } - } - if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { - ++map; - offset = 0; - } else { - map = &pid_ns->pidmap[0]; - offset = RESERVED_PIDS; - if (unlikely(last == offset)) - break; - } - pid = mk_pid(pid_ns, map, offset); - } - return -EAGAIN; -} - -int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) -{ - int offset; - struct pidmap *map, *end; - - if (last >= PID_MAX_LIMIT) - return -1; - - offset = (last + 1) & BITS_PER_PAGE_MASK; - map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; - end = &pid_ns->pidmap[PIDMAP_ENTRIES]; - for (; map < end; map++, offset = 0) { - if (unlikely(!map->page)) - continue; - offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); - if (offset < BITS_PER_PAGE) - return mk_pid(pid_ns, map, offset); - } - return -1; -} - void put_pid(struct pid *pid) { struct pid_namespace *ns; @@ -266,7 +124,7 @@ void free_pid(struct pid *pid) struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); - switch(--ns->nr_hashed) { + switch (--ns->nr_hashed) { case 2: case 1: /* When all that is left in the pid namespace @@ -284,12 +142,11 @@ void free_pid(struct pid *pid) schedule_work(&ns->proc_work); break; } + + idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); - call_rcu(&pid->rcu, delayed_put_pid); } @@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = ns; pid->level = ns->level; + for (i = ns->level; i >= 0; i--) { - nr = alloc_pidmap(tmp); + int pid_min = 1; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&pidmap_lock); + + /* + * init really needs pid 1, but after reaching the maximum + * wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + spin_unlock_irq(&pidmap_lock); + idr_preload_end(); + if (nr < 0) { retval = nr; goto out_free; @@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + /* Make the PID visible to find_pid_ns. */ + idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->nr_hashed++; } spin_unlock_irq(&pidmap_lock); @@ -350,8 +230,11 @@ out_unlock: put_pid_ns(ns); out_free: + spin_lock_irq(&pidmap_lock); while (++i <= ns->level) - free_pidmap(pid->numbers + i); + idr_remove(&ns->idr, (pid->numbers + i)->nr); + + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); @@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns); */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { - struct pid *pid; - - do { - pid = find_pid_ns(nr, ns); - if (pid) - break; - nr = next_pidmap(ns, nr); - } while (nr > 0); - - return pid; + return idr_get_next(&ns->idr, &nr); } /* @@ -578,7 +452,7 @@ void __init pidhash_init(void) 0, 4096); } -void __init pidmap_init(void) +void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); @@ -590,10 +464,7 @@ void __init pidmap_init(void) PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); - init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, init_pid_ns.pidmap[0].page); - atomic_dec(&init_pid_ns.pidmap[0].nr_free); + idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 4918314893bc..ca7c8a8823b1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -21,6 +21,7 @@ #include #include #include +#include struct pid_cache { int nr_ids; @@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; - int i; int err; err = -EINVAL; @@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns == NULL) goto out_dec; - ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!ns->pidmap[0].page) - goto out_free; + idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level + 1); if (ns->pid_cachep == NULL) - goto out_free_map; + goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) - goto out_free_map; + goto out_free_idr; ns->ns.ops = &pidns_operations; kref_init(&ns->kref); @@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->nr_hashed = PIDNS_HASH_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); - set_bit(0, ns->pidmap[0].page); - atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); - - for (i = 1; i < PIDMAP_ENTRIES; i++) - atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - return ns; -out_free_map: - kfree(ns->pidmap[0].page); -out_free: +out_free_idr: + idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); @@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { - int i; - ns_free_inum(&ns->ns); - for (i = 0; i < PIDMAP_ENTRIES; i++) - kfree(ns->pidmap[i].page); + + idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } @@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; + struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * maintain a tasklist for each pid namespace. * */ + rcu_read_lock(); read_lock(&tasklist_lock); - nr = next_pidmap(pid_ns, 1); - while (nr > 0) { - rcu_read_lock(); - - task = pid_task(find_vpid(nr), PIDTYPE_PID); + nr = 2; + idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { + task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) send_sig_info(SIGKILL, SEND_SIG_FORCED, task); - - rcu_read_unlock(); - - nr = next_pidmap(pid_ns, nr); } read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. @@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; + int ret, next; if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, * it should synchronize its usage with external means. */ - tmp.data = &pid_ns->last_pid; - return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + next = idr_get_cursor(&pid_ns->idr) - 1; + + tmp.data = &next; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (!ret && write) + idr_set_cursor(&pid_ns->idr, next + 1); + + return ret; } extern int pid_max; -- cgit From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001 From: Gargi Sharma Date: Fri, 17 Nov 2017 15:30:34 -0800 Subject: pid: remove pidhash pidhash is no longer required as all the information can be looked up from idr tree. nr_hashed represented the number of pids that had been hashed. Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant, it has been renamed to pid_allocated and PIDNS_ADDING respectively. [gs051095@gmail.com: v6] Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com Signed-off-by: Gargi Sharma Reviewed-by: Rik van Riel Tested-by: Tony Luck [ia64] Cc: Julia Lawall Cc: Ingo Molnar Cc: Pavel Tatashin Cc: Kirill Tkhai Cc: Oleg Nesterov Cc: Eric W. Biederman Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/pid.c | 48 ++++++++++-------------------------------------- kernel/pid_namespace.c | 6 +++--- 3 files changed, 14 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4e55eedba8d6..432eadf6b58c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process( retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } - if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_cancel_cgroup; } diff --git a/kernel/pid.c b/kernel/pid.c index 0ce59369632f..b13b624e2c49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,10 +41,6 @@ #include #include -#define pid_hashfn(nr, ns) \ - hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) -static struct hlist_head *pid_hash; -static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; - /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max @@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT; struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), .idr = IDR_INIT, - .nr_hashed = PIDNS_HASH_ADDING, + .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, @@ -123,8 +118,7 @@ void free_pid(struct pid *pid) for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; - hlist_del_rcu(&upid->pid_chain); - switch (--ns->nr_hashed) { + switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace @@ -133,10 +127,10 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; - case PIDNS_HASH_ADDING: + case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); - ns->nr_hashed = 0; + ns->pid_allocated = 0; /* fall through */ case 0: schedule_work(&ns->proc_work); @@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - if (!(ns->nr_hashed & PIDNS_HASH_ADDING)) + if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { - hlist_add_head_rcu(&upid->pid_chain, - &pid_hash[pid_hashfn(upid->nr, upid->ns)]); /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); - upid->ns->nr_hashed++; + upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); @@ -243,21 +235,13 @@ out_free: void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); - ns->nr_hashed &= ~PIDNS_HASH_ADDING; + ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { - struct upid *pnr; - - hlist_for_each_entry_rcu(pnr, - &pid_hash[pid_hashfn(nr, ns)], pid_chain) - if (pnr->nr == nr && pnr->ns == ns) - return container_of(pnr, struct pid, - numbers[ns->level]); - - return NULL; + return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); @@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (type != PIDTYPE_PID) { if (type == __PIDTYPE_TGID) type = PIDTYPE_PID; + task = task->group_leader; } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); @@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } -/* - * The pid hash table is scaled according to the amount of memory in the - * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or - * more. - */ -void __init pidhash_init(void) -{ - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL | HASH_ZERO, - &pidhash_shift, NULL, - 0, 4096); -} - void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ - BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); + BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ca7c8a8823b1..0b53eef7d34b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - ns->nr_hashed = PIDNS_HASH_ADDING; + ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * sys_wait4() above can't reap the EXIT_DEAD children but we do not * really care, we could reparent them to the global init. We could * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), * pid_ns can not go away until proc_kill_sb() drops the reference. * * But this ns can also have other tasks injected by setns()+fork(). @@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (pid_ns->nr_hashed == init_pids) + if (pid_ns->pid_allocated == init_pids) break; schedule(); } -- cgit From 4efb442cc12eb66535b7c7ed06005fd7889c1d77 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 17 Nov 2017 15:30:38 -0800 Subject: kernel/panic.c: add TAINT_AUX This is the gist of a patch which we've been forward-porting in our kernels for a long time now and it probably would make a good sense to have such TAINT_AUX flag upstream which can be used by each distro etc, how they see fit. This way, we won't need to forward-port a distro-only version indefinitely. Add an auxiliary taint flag to be used by distros and others. This obviates the need to forward-port whatever internal solutions people have in favor of a single flag which they can map arbitrarily to a definition of their pleasing. The "X" mnemonic could also mean eXternal, which would be taint from a distro or something else but not the upstream kernel. We will use it to mark modules for which we don't provide support. I.e., a really eXternal module. Link: http://lkml.kernel.org/r/20170911134533.dp5mtyku5bongx4c@pd.tnic Signed-off-by: Borislav Petkov Cc: Kees Cook Cc: Jessica Yu Cc: Peter Zijlstra Cc: Jiri Slaby Cc: Jiri Olsa Cc: Michal Marek Cc: Jiri Kosina Cc: Takashi Iwai Cc: Petr Mladek Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 3242b64b1956..2cfef408fec9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { { 'E', ' ', true }, /* TAINT_UNSIGNED_MODULE */ { 'L', ' ', false }, /* TAINT_SOFTLOCKUP */ { 'K', ' ', true }, /* TAINT_LIVEPATCH */ + { 'X', ' ', true }, /* TAINT_AUX */ }; /** @@ -345,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. * 'K' - Kernel has been live patched. + * 'X' - Auxiliary taint, for distros' use. * * The string is overwritten by the next call to print_tainted(). */ -- cgit From fcf4edac049a8bca41658970292e2dfdbc9d5f62 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 17 Nov 2017 15:30:42 -0800 Subject: kcov: remove pointless current != NULL check __sanitizer_cov_trace_pc() is a hot code, so it's worth to remove pointless '!current' check. Current is never NULL. Link: http://lkml.kernel.org/r/20170929162221.32500-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Acked-by: Mark Rutland Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index fc6af9e1308b..d9f9fa9cacc6 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -62,7 +62,7 @@ void notrace __sanitizer_cov_trace_pc(void) * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ - if (!t || !in_task()) + if (!in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { -- cgit From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001 From: Victor Chibotaru Date: Fri, 17 Nov 2017 15:30:46 -0800 Subject: kcov: support comparison operands collection Enables kcov to collect comparison operands from instrumented code. This is done by using Clang's -fsanitize=trace-cmp instrumentation (currently not available for GCC). The comparison operands help a lot in fuzz testing. E.g. they are used in Syzkaller to cover the interiors of conditional statements with way less attempts and thus make previously unreachable code reachable. To allow separate collection of coverage and comparison operands two different work modes are implemented. Mode selection is now done via a KCOV_ENABLE ioctl call with corresponding argument value. Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com Signed-off-by: Victor Chibotaru Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Mark Rutland Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Kees Cook Cc: Vegard Nossum Cc: Quentin Casasnovas Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 179 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index d9f9fa9cacc6..15f33faf4013 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -22,13 +22,21 @@ #include #include +/* Number of 64-bit words written per one comparison: */ +#define KCOV_WORDS_PER_CMP 4 + /* * kcov descriptor (one per opened debugfs file). * State transitions of the descriptor: * - initial state after open() * - then there must be a single ioctl(KCOV_INIT_TRACE) call * - then, mmap() call (several calls are allowed but not useful) - * - then, repeated enable/disable for a task (only one task a time allowed) + * - then, ioctl(KCOV_ENABLE, arg), where arg is + * KCOV_TRACE_PC - to trace only the PCs + * or + * KCOV_TRACE_CMP - to trace only the comparison operands + * - then, ioctl(KCOV_DISABLE) to disable the task. + * Enabling/disabling ioctls can be repeated (only one task a time allowed). */ struct kcov { /* @@ -48,51 +56,176 @@ struct kcov { struct task_struct *t; }; -/* - * Entry point from instrumented code. - * This is called once per basic-block/edge. - */ -void notrace __sanitizer_cov_trace_pc(void) +static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { - struct task_struct *t; enum kcov_mode mode; - t = current; /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. */ if (!in_task()) - return; + return false; mode = READ_ONCE(t->kcov_mode); - if (mode == KCOV_MODE_TRACE) { - unsigned long *area; - unsigned long pos; - unsigned long ip = _RET_IP_; + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + return mode == needed_mode; +} +static unsigned long canonicalize_ip(unsigned long ip) +{ #ifdef CONFIG_RANDOMIZE_BASE - ip -= kaslr_offset(); + ip -= kaslr_offset(); #endif + return ip; +} - /* - * There is some code that runs in interrupts but for which - * in_interrupt() returns false (e.g. preempt_schedule_irq()). - * READ_ONCE()/barrier() effectively provides load-acquire wrt - * interrupts, there are paired barrier()/WRITE_ONCE() in - * kcov_ioctl_locked(). - */ - barrier(); - area = t->kcov_area; - /* The first word is number of subsequent PCs. */ - pos = READ_ONCE(area[0]) + 1; - if (likely(pos < t->kcov_size)) { - area[pos] = ip; - WRITE_ONCE(area[0], pos); - } +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void notrace __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + unsigned long *area; + unsigned long ip = canonicalize_ip(_RET_IP_); + unsigned long pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t)) + return; + + area = t->kcov_area; + /* The first 64-bit word is the number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = ip; + WRITE_ONCE(area[0], pos); } } EXPORT_SYMBOL(__sanitizer_cov_trace_pc); +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS +static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +{ + struct task_struct *t; + u64 *area; + u64 count, start_index, end_pos, max_pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t)) + return; + + ip = canonicalize_ip(ip); + + /* + * We write all comparison arguments and types as u64. + * The buffer was allocated for t->kcov_size unsigned longs. + */ + area = (u64 *)t->kcov_area; + max_pos = t->kcov_size * sizeof(unsigned long); + + count = READ_ONCE(area[0]); + + /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */ + start_index = 1 + count * KCOV_WORDS_PER_CMP; + end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); + if (likely(end_pos <= max_pos)) { + area[start_index] = type; + area[start_index + 1] = arg1; + area[start_index + 2] = arg2; + area[start_index + 3] = ip; + WRITE_ONCE(area[0], count + 1); + } +} + +void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1); + +void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); + +void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); + +void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8); + +void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1); + +void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); + +void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); + +void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); + +void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases) +{ + u64 i; + u64 count = cases[0]; + u64 size = cases[1]; + u64 type = KCOV_CMP_CONST; + + switch (size) { + case 8: + type |= KCOV_CMP_SIZE(0); + break; + case 16: + type |= KCOV_CMP_SIZE(1); + break; + case 32: + type |= KCOV_CMP_SIZE(2); + break; + case 64: + type |= KCOV_CMP_SIZE(3); + break; + default: + return; + } + for (i = 0; i < count; i++) + write_comp_data(type, cases[i + 2], val, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_switch); +#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ + static void kcov_get(struct kcov *kcov) { atomic_inc(&kcov->refcount); @@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t) /* Just to not leave dangling references behind. */ kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; spin_unlock(&kcov->lock); kcov_put(kcov); } @@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) spin_lock(&kcov->lock); size = kcov->size * sizeof(unsigned long); - if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; @@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep) kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); if (!kcov) return -ENOMEM; + kcov->mode = KCOV_MODE_DISABLED; atomic_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; @@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, if (size < 2 || size > INT_MAX / sizeof(unsigned long)) return -EINVAL; kcov->size = size; - kcov->mode = KCOV_MODE_TRACE; + kcov->mode = KCOV_MODE_INIT; return 0; case KCOV_ENABLE: /* @@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, * at task exit or voluntary by KCOV_DISABLE. After that it can * be enabled for another task. */ - unused = arg; - if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || - kcov->area == NULL) + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; if (kcov->t != NULL) return -EBUSY; + if (arg == KCOV_TRACE_PC) + kcov->mode = KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + kcov->mode = KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; - /* See comment in __sanitizer_cov_trace_pc(). */ + /* See comment in check_kcov_mode(). */ barrier(); WRITE_ONCE(t->kcov_mode, kcov->mode); t->kcov = kcov; @@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov_task_init(t); kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; kcov_put(kcov); return 0; default: -- cgit From 2d8364bae4db144df75ba85e92d2b8619ba8eedc Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Fri, 17 Nov 2017 15:30:57 -0800 Subject: kernel/reboot.c: add devm_register_reboot_notifier() Add devm_* wrapper around register_reboot_notifier to simplify device specific reboot notifier registration/unregistration. [akpm@linux-foundation.org: move `struct device' forward decl to top-of-file] Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com Signed-off-by: Andrey Smirnov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index bd30a973fe94..e4ced883d8de 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +static void devm_unregister_reboot_notifier(struct device *dev, void *res) +{ + WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res)); +} + +int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb) +{ + struct notifier_block **rcnb; + int ret; + + rcnb = devres_alloc(devm_unregister_reboot_notifier, + sizeof(*rcnb), GFP_KERNEL); + if (!rcnb) + return -ENOMEM; + + ret = register_reboot_notifier(nb); + if (!ret) { + *rcnb = nb; + devres_add(dev, rcnb); + } else { + devres_free(rcnb); + } + + return ret; +} +EXPORT_SYMBOL(devm_register_reboot_notifier); + /* * Notifier list for kernel code which wants to be called * to restart the system. -- cgit From 24ed960abf1d50cb7834e99a0cfc081bc0656712 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 28 Aug 2017 11:28:21 -0700 Subject: treewide: Switch DEFINE_TIMER callbacks to struct timer_list * This changes all DEFINE_TIMER() callbacks to use a struct timer_list pointer instead of unsigned long. Since the data argument has already been removed, none of these callbacks are using their argument currently, so this renames the argument to "unused". Done using the following semantic patch: @match_define_timer@ declarer name DEFINE_TIMER; identifier _timer, _callback; @@ DEFINE_TIMER(_timer, _callback); @change_callback depends on match_define_timer@ identifier match_define_timer._callback; type _origtype; identifier _origarg; @@ void -_callback(_origtype _origarg) +_callback(struct timer_list *unused) { ... } Signed-off-by: Kees Cook --- kernel/irq/spurious.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 1215229d1c12..ef2a47e0eab6 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -20,7 +20,7 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) -static void poll_spurious_irqs(unsigned long dummy); +static void poll_spurious_irqs(struct timer_list *unused); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); static int irq_poll_cpu; static atomic_t irq_poll_active; @@ -143,7 +143,7 @@ out: return ok; } -static void poll_spurious_irqs(unsigned long dummy) +static void poll_spurious_irqs(struct timer_list *unused) { struct irq_desc *desc; int i; -- cgit From b9eaf18722221ef8b2bd6a67240ebe668622152a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 13:15:39 -0700 Subject: treewide: init_timer() -> setup_timer() This mechanically converts all remaining cases of ancient open-coded timer setup with the old setup_timer() API, which is the first step in timer conversions. This has no behavioral changes, since it ultimately just changes the order of assignment to fields of struct timer_list when finding variations of: init_timer(&t); f.function = timer_callback; t.data = timer_callback_arg; to be converted into: setup_timer(&t, timer_callback, timer_callback_arg); The conversion is done with the following Coccinelle script, which is an improved version of scripts/cocci/api/setup_timer.cocci, in the following ways: - assignments-before-init_timer() cases - limit the .data case removal to the specific struct timer_list instance - handling calls by dereference (timer->field vs timer.field) spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . \ --cocci-file ~/src/data/setup_timer.cocci @fix_address_of@ expression e; @@ init_timer( -&(e) +&e , ...) // Match the common cases first to avoid Coccinelle parsing loops with // "... when" clauses. @match_immediate_function_data_after_init_timer@ expression e, func, da; @@ -init_timer +setup_timer ( \(&e\|e\) +, func, da ); ( -\(e.function\|e->function\) = func; -\(e.data\|e->data\) = da; | -\(e.data\|e->data\) = da; -\(e.function\|e->function\) = func; ) @match_immediate_function_data_before_init_timer@ expression e, func, da; @@ ( -\(e.function\|e->function\) = func; -\(e.data\|e->data\) = da; | -\(e.data\|e->data\) = da; -\(e.function\|e->function\) = func; ) -init_timer +setup_timer ( \(&e\|e\) +, func, da ); @match_function_and_data_after_init_timer@ expression e, e2, e3, e4, e5, func, da; @@ -init_timer +setup_timer ( \(&e\|e\) +, func, da ); ... when != func = e2 when != da = e3 ( -e.function = func; ... when != da = e4 -e.data = da; | -e->function = func; ... when != da = e4 -e->data = da; | -e.data = da; ... when != func = e5 -e.function = func; | -e->data = da; ... when != func = e5 -e->function = func; ) @match_function_and_data_before_init_timer@ expression e, e2, e3, e4, e5, func, da; @@ ( -e.function = func; ... when != da = e4 -e.data = da; | -e->function = func; ... when != da = e4 -e->data = da; | -e.data = da; ... when != func = e5 -e.function = func; | -e->data = da; ... when != func = e5 -e->function = func; ) ... when != func = e2 when != da = e3 -init_timer +setup_timer ( \(&e\|e\) +, func, da ); @r1 exists@ expression t; identifier f; position p; @@ f(...) { ... when any init_timer@p(\(&t\|t\)) ... when any } @r2 exists@ expression r1.t; identifier g != r1.f; expression e8; @@ g(...) { ... when any \(t.data\|t->data\) = e8 ... when any } // It is dangerous to use setup_timer if data field is initialized // in another function. @script:python depends on r2@ p << r1.p; @@ cocci.include_match(False) @r3@ expression r1.t, func, e7; position r1.p; @@ ( -init_timer@p(&t); +setup_timer(&t, func, 0UL); ... when != func = e7 -t.function = func; | -t.function = func; ... when != func = e7 -init_timer@p(&t); +setup_timer(&t, func, 0UL); | -init_timer@p(t); +setup_timer(t, func, 0UL); ... when != func = e7 -t->function = func; | -t->function = func; ... when != func = e7 -init_timer@p(t); +setup_timer(t, func, 0UL); ) Signed-off-by: Kees Cook --- kernel/time/clocksource.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 03918a19cf2d..5b51d5ba2a85 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; - init_timer(&watchdog_timer); - watchdog_timer.function = clocksource_watchdog; + setup_timer(&watchdog_timer, clocksource_watchdog, 0UL); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; -- cgit From e99e88a9d2b067465adaa9c111ada99a041bef9a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 14:43:17 -0700 Subject: treewide: setup_timer() -> timer_setup() This converts all remaining cases of the old setup_timer() API into using timer_setup(), where the callback argument is the structure already holding the struct timer_list. These should have no behavioral changes, since they just change which pointer is passed into the callback with the same available pointers after conversion. It handles the following examples, in addition to some other variations. Casting from unsigned long: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... setup_timer(&ptr->my_timer, my_callback, ptr); and forced object casts: void my_callback(struct something *ptr) { ... } ... setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr); become: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... timer_setup(&ptr->my_timer, my_callback, 0); Direct function assignments: void my_callback(unsigned long data) { struct something *ptr = (struct something *)data; ... } ... ptr->my_timer.function = my_callback; have a temporary cast added, along with converting the args: void my_callback(struct timer_list *t) { struct something *ptr = from_timer(ptr, t, my_timer); ... } ... ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback; And finally, callbacks without a data assignment: void my_callback(unsigned long data) { ... } ... setup_timer(&ptr->my_timer, my_callback, 0); have their argument renamed to verify they're unused during conversion: void my_callback(struct timer_list *unused) { ... } ... timer_setup(&ptr->my_timer, my_callback, 0); The conversion is done with the following Coccinelle script: spatch --very-quiet --all-includes --include-headers \ -I ./arch/x86/include -I ./arch/x86/include/generated \ -I ./include -I ./arch/x86/include/uapi \ -I ./arch/x86/include/generated/uapi -I ./include/uapi \ -I ./include/generated/uapi --include ./include/linux/kconfig.h \ --dir . \ --cocci-file ~/src/data/timer_setup.cocci @fix_address_of@ expression e; @@ setup_timer( -&(e) +&e , ...) // Update any raw setup_timer() usages that have a NULL callback, but // would otherwise match change_timer_function_usage, since the latter // will update all function assignments done in the face of a NULL // function initialization in setup_timer(). @change_timer_function_usage_NULL@ expression _E; identifier _timer; type _cast_data; @@ ( -setup_timer(&_E->_timer, NULL, _E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E->_timer, NULL, (_cast_data)_E); +timer_setup(&_E->_timer, NULL, 0); | -setup_timer(&_E._timer, NULL, &_E); +timer_setup(&_E._timer, NULL, 0); | -setup_timer(&_E._timer, NULL, (_cast_data)&_E); +timer_setup(&_E._timer, NULL, 0); ) @change_timer_function_usage@ expression _E; identifier _timer; struct timer_list _stl; identifier _callback; type _cast_func, _cast_data; @@ ( -setup_timer(&_E->_timer, _callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, &_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, _E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, &_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E); +timer_setup(&_E._timer, _callback, 0); | _E->_timer@_stl.function = _callback; | _E->_timer@_stl.function = &_callback; | _E->_timer@_stl.function = (_cast_func)_callback; | _E->_timer@_stl.function = (_cast_func)&_callback; | _E._timer@_stl.function = _callback; | _E._timer@_stl.function = &_callback; | _E._timer@_stl.function = (_cast_func)_callback; | _E._timer@_stl.function = (_cast_func)&_callback; ) // callback(unsigned long arg) @change_callback_handle_cast depends on change_timer_function_usage@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; identifier _handle; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { ( ... when != _origarg _handletype *_handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(_handletype *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg | ... when != _origarg _handletype *_handle; ... when != _handle _handle = -(void *)_origarg; +from_timer(_handle, t, _timer); ... when != _origarg ) } // callback(unsigned long arg) without existing variable @change_callback_handle_cast_no_arg depends on change_timer_function_usage && !change_callback_handle_cast@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _origtype; identifier _origarg; type _handletype; @@ void _callback( -_origtype _origarg +struct timer_list *t ) { + _handletype *_origarg = from_timer(_origarg, t, _timer); + ... when != _origarg - (_handletype *)_origarg + _origarg ... when != _origarg } // Avoid already converted callbacks. @match_callback_converted depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier t; @@ void _callback(struct timer_list *t) { ... } // callback(struct something *handle) @change_callback_handle_arg depends on change_timer_function_usage && !match_callback_converted && !change_callback_handle_cast && !change_callback_handle_cast_no_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; @@ void _callback( -_handletype *_handle +struct timer_list *t ) { + _handletype *_handle = from_timer(_handle, t, _timer); ... } // If change_callback_handle_arg ran on an empty function, remove // the added handler. @unchange_callback_handle_arg depends on change_timer_function_usage && change_callback_handle_arg@ identifier change_timer_function_usage._callback; identifier change_timer_function_usage._timer; type _handletype; identifier _handle; identifier t; @@ void _callback(struct timer_list *t) { - _handletype *_handle = from_timer(_handle, t, _timer); } // We only want to refactor the setup_timer() data argument if we've found // the matching callback. This undoes changes in change_timer_function_usage. @unchange_timer_function_usage depends on change_timer_function_usage && !change_callback_handle_cast && !change_callback_handle_cast_no_arg && !change_callback_handle_arg@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type change_timer_function_usage._cast_data; @@ ( -timer_setup(&_E->_timer, _callback, 0); +setup_timer(&_E->_timer, _callback, (_cast_data)_E); | -timer_setup(&_E._timer, _callback, 0); +setup_timer(&_E._timer, _callback, (_cast_data)&_E); ) // If we fixed a callback from a .function assignment, fix the // assignment cast now. @change_timer_function_assignment depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression change_timer_function_usage._E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_func; typedef TIMER_FUNC_TYPE; @@ ( _E->_timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -&_callback +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)_callback; +(TIMER_FUNC_TYPE)_callback ; | _E->_timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -&_callback; +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)_callback +(TIMER_FUNC_TYPE)_callback ; | _E._timer.function = -(_cast_func)&_callback +(TIMER_FUNC_TYPE)_callback ; ) // Sometimes timer functions are called directly. Replace matched args. @change_timer_function_calls depends on change_timer_function_usage && (change_callback_handle_cast || change_callback_handle_cast_no_arg || change_callback_handle_arg)@ expression _E; identifier change_timer_function_usage._timer; identifier change_timer_function_usage._callback; type _cast_data; @@ _callback( ( -(_cast_data)_E +&_E->_timer | -(_cast_data)&_E +&_E._timer | -_E +&_E->_timer ) ) // If a timer has been configured without a data argument, it can be // converted without regard to the callback argument, since it is unused. @match_timer_function_unused_data@ expression _E; identifier _timer; identifier _callback; @@ ( -setup_timer(&_E->_timer, _callback, 0); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0L); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E->_timer, _callback, 0UL); +timer_setup(&_E->_timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0L); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_E._timer, _callback, 0UL); +timer_setup(&_E._timer, _callback, 0); | -setup_timer(&_timer, _callback, 0); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0L); +timer_setup(&_timer, _callback, 0); | -setup_timer(&_timer, _callback, 0UL); +timer_setup(&_timer, _callback, 0); | -setup_timer(_timer, _callback, 0); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0L); +timer_setup(_timer, _callback, 0); | -setup_timer(_timer, _callback, 0UL); +timer_setup(_timer, _callback, 0); ) @change_callback_unused_data depends on match_timer_function_unused_data@ identifier match_timer_function_unused_data._callback; type _origtype; identifier _origarg; @@ void _callback( -_origtype _origarg +struct timer_list *unused ) { ... when != _origarg } Signed-off-by: Kees Cook --- kernel/padata.c | 6 +++--- kernel/time/clocksource.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index f262c9a4e70a..57c0074d50cc 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -288,9 +288,9 @@ static void invoke_padata_reorder(struct work_struct *work) local_bh_enable(); } -static void padata_reorder_timer(unsigned long arg) +static void padata_reorder_timer(struct timer_list *t) { - struct parallel_data *pd = (struct parallel_data *)arg; + struct parallel_data *pd = from_timer(pd, t, timer); unsigned int weight; int target_cpu, cpu; @@ -485,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); - setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); + timer_setup(&pd->timer, padata_reorder_timer, 0); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 5b51d5ba2a85..65f9e3f24dde 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static void clocksource_watchdog(unsigned long data) +static void clocksource_watchdog(struct timer_list *unused) { struct clocksource *cs; u64 csnow, wdnow, cslast, wdlast, delta; @@ -290,7 +290,7 @@ static inline void clocksource_start_watchdog(void) { if (watchdog_running || !watchdog || list_empty(&watchdog_list)) return; - setup_timer(&watchdog_timer, clocksource_watchdog, 0UL); + timer_setup(&watchdog_timer, clocksource_watchdog, 0); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); watchdog_running = 1; -- cgit From c1eba5bcb6430868427e0b9d1cd1205a07302f06 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:18:19 -0700 Subject: timer: Pass timer_list pointer to callbacks unconditionally Now that all timer callbacks are already taking their struct timer_list pointer as the callback argument, just do this unconditionally and remove the .data field. Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- kernel/time/timer.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af0b8bae4502..a07eb124332f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1107,12 +1107,12 @@ EXPORT_SYMBOL(timer_reduce); * add_timer - start a timer * @timer: the timer to be added * - * The kernel will do a ->function(->data) callback from the + * The kernel will do a ->function(@timer) callback from the * timer interrupt at the ->expires point in the future. The * current time is 'jiffies'. * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. + * The timer's ->expires, ->function fields must be set prior calling this + * function. * * Timers with an ->expires field in the past will be executed in the next * timer tick. @@ -1284,8 +1284,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), - unsigned long data) +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) { int count = preempt_count(); @@ -1309,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer); - fn(data); + fn((TIMER_DATA_TYPE)timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1332,7 +1331,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(unsigned long); - unsigned long data; timer = hlist_entry(head->first, struct timer_list, entry); @@ -1340,15 +1338,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) detach_timer(timer, true); fn = timer->function; - data = timer->data; if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn, data); + call_timer_fn(timer, fn); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn, data); + call_timer_fn(timer, fn); raw_spin_lock_irq(&base->lock); } } -- cgit From 354b46b1a0adda1dd5b7f0bc2a5604cca091be5f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 19:15:40 -0700 Subject: timer: Switch callback prototype to take struct timer_list * argument Since all callbacks have been converted, we can switch the core prototype to "struct timer_list *" now too. Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- kernel/time/timer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a07eb124332f..0f0d49a02d04 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1284,7 +1284,7 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) +static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) { int count = preempt_count(); @@ -1308,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long)) lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer); - fn((TIMER_DATA_TYPE)timer); + fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1330,7 +1330,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) { while (!hlist_empty(head)) { struct timer_list *timer; - void (*fn)(unsigned long); + void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); -- cgit From 188665b2d67db8953899551d1a9d4481b2a0ac60 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 22 Oct 2017 18:14:46 -0700 Subject: timer: Pass function down to initialization routines In preparation for removing more macros, pass the function down to the initialization routines instead of doing it in macros. Cc: Thomas Gleixner Cc: John Stultz Cc: Stephen Boyd Signed-off-by: Kees Cook --- kernel/time/timer.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 0f0d49a02d04..ffebcf878fba 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer) debug_object_assert_init(timer, &timer_debug_descr); } -static void do_init_timer(struct timer_list *timer, unsigned int flags, +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, +void init_timer_on_stack_key(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); - do_init_timer(timer, flags, name, key); + do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); @@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer) debug_timer_assert_init(timer); } -static void do_init_timer(struct timer_list *timer, unsigned int flags, +static void do_init_timer(struct timer_list *timer, + void (*func)(struct timer_list *), + unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; + timer->function = func; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } @@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, /** * init_timer_key - initialize a timer * @timer: the timer to be initialized + * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer @@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, unsigned int flags, +void init_timer_key(struct timer_list *timer, + void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); - do_init_timer(timer, flags, name, key); + do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(init_timer_key); -- cgit From 841b86f3289dbe858daeceec36423d4ea286fac2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 23 Oct 2017 09:40:42 +0200 Subject: treewide: Remove TIMER_FUNC_TYPE and TIMER_DATA_TYPE casts With all callbacks converted, and the timer callback prototype switched over, the TIMER_FUNC_TYPE cast is no longer needed, so remove it. Conversion was done with the following scripts: perl -pi -e 's|\(TIMER_FUNC_TYPE\)||g' \ $(git grep TIMER_FUNC_TYPE | cut -d: -f1 | sort -u) perl -pi -e 's|\(TIMER_DATA_TYPE\)||g' \ $(git grep TIMER_DATA_TYPE | cut -d: -f1 | sort -u) The now unused macros are also dropped from include/linux/timer.h. Signed-off-by: Kees Cook --- kernel/kthread.c | 2 +- kernel/workqueue.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 8af313081b0d..cd50e99202b0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -843,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker, struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; - WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dde6298f6b22..8fdb710bfdd7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct work_struct *work = &dwork->work; WARN_ON_ONCE(!wq); - WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn); + WARN_ON_ONCE(timer->function != delayed_work_timer_fn); WARN_ON_ONCE(timer_pending(timer)); WARN_ON_ONCE(!list_empty(&work->entry)); -- cgit