Diffstat (limited to 'kernel')
154 files changed, 7487 insertions, 3171 deletions
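Among other things, the hunks below add a new set of bpf_cpumask kfuncs in kernel/bpf/cpumask.c. As a rough illustration only (not part of the commit), a BPF program could exercise them roughly as sketched here: the kfunc prototypes mirror the signatures added further down in this diff, while the attach point (tp_btf/task_newtask), the __ksym extern declarations, and the build setup (vmlinux.h, libbpf) are assumptions of the example.

/* Hypothetical usage sketch for the bpf_cpumask kfuncs added in
 * kernel/bpf/cpumask.c below; not part of this diff.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Prototypes matching the kfunc definitions added by this series. */
struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(cpumask_example, struct task_struct *task, u64 clone_flags)
{
	struct bpf_cpumask *mask;

	/* bpf_cpumask_create() uses the BPF memory allocator and may fail. */
	mask = bpf_cpumask_create();
	if (!mask)
		return 0;

	bpf_cpumask_set_cpu(0, mask);
	if (!bpf_cpumask_test_cpu(0, (const struct cpumask *)mask))
		bpf_printk("unexpected: bit 0 not set");

	/* An acquired bpf_cpumask must be released (or stored in a map as a
	 * kptr) before the program exits, or the verifier rejects the load.
	 */
	bpf_cpumask_release(mask);
	return 0;
}

char _license[] SEC("license") = "GPL";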
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 547c88be8a28..addeed3df15d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -64,6 +64,7 @@  #include <uapi/linux/limits.h>  #include <uapi/linux/netfilter/nf_tables.h>  #include <uapi/linux/openat2.h> // struct open_how +#include <uapi/linux/fanotify.h>  #include "audit.h" @@ -1294,15 +1295,11 @@ out:  static void audit_log_cap(struct audit_buffer *ab, char *prefix,  			  kernel_cap_t *cap)  { -	int i; -  	if (cap_isclear(*cap)) {  		audit_log_format(ab, " %s=0", prefix);  		return;  	} -	audit_log_format(ab, " %s=", prefix); -	CAP_FOR_EACH_U32(i) -		audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); +	audit_log_format(ab, " %s=%016llx", prefix, cap->val);  }  static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) @@ -2252,7 +2249,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,  	if (!dentry)  		return 0; -	rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps); +	rc = get_vfs_caps_from_disk(&nop_mnt_idmap, dentry, &caps);  	if (rc)  		return rc; @@ -2807,7 +2804,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,  	ax->d.next = context->aux;  	context->aux = (void *)ax; -	get_vfs_caps_from_disk(&init_user_ns, +	get_vfs_caps_from_disk(&nop_mnt_idmap,  			       bprm->file->f_path.dentry, &vcaps);  	ax->fcap.permitted = vcaps.permitted; @@ -2877,10 +2874,21 @@ void __audit_log_kern_module(char *name)  	context->type = AUDIT_KERN_MODULE;  } -void __audit_fanotify(unsigned int response) +void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)  { -	audit_log(audit_context(), GFP_KERNEL, -		AUDIT_FANOTIFY,	"resp=%u", response); +	/* {subj,obj}_trust values are {0,1,2}: no,yes,unknown */ +	switch (friar->hdr.type) { +	case FAN_RESPONSE_INFO_NONE: +		audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, +			  "resp=%u fan_type=%u fan_info=0 subj_trust=2 obj_trust=2", +			  response, FAN_RESPONSE_INFO_NONE); +		break; +	case FAN_RESPONSE_INFO_AUDIT_RULE: +		audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, +			  "resp=%u fan_type=%u fan_info=%X subj_trust=%u obj_trust=%u", +			  response, friar->hdr.type, friar->rule_number, +			  friar->subj_trust, friar->obj_trust); +	}  }  void __audit_tk_injoffset(struct timespec64 offset) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3a12e6b400a2..02242614dcc7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o  endif  ifeq ($(CONFIG_BPF_JIT),y)  obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-$(CONFIG_BPF_SYSCALL) += cpumask.o  obj-${CONFIG_BPF_LSM} += bpf_lsm.o  endif  obj-$(CONFIG_BPF_PRELOAD) += preload/ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b39a46e8fb08..35f4138a54dc 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -568,8 +568,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att  	nbuckets = max_t(u32, 2, nbuckets);  	smap->bucket_log = ilog2(nbuckets); -	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, -				 GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); +	smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), +					 nbuckets, GFP_USER | __GFP_NOWARN);  	if (!smap->buckets) {  		bpf_map_area_free(smap);  		return ERR_PTR(-ENOMEM); @@ -580,8 +580,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att  		raw_spin_lock_init(&smap->buckets[i].lock);  	} -	
smap->elem_size = -		sizeof(struct bpf_local_storage_elem) + attr->value_size; +	smap->elem_size = offsetof(struct bpf_local_storage_elem, +				   sdata.data[attr->value_size]);  	return smap;  } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index a4a41ee3e80b..e14c822f8911 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -51,7 +51,6 @@ BTF_SET_END(bpf_lsm_current_hooks)   */  BTF_SET_START(bpf_lsm_locked_sockopt_hooks)  #ifdef CONFIG_SECURITY_NETWORK -BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)  BTF_ID(func, bpf_lsm_sock_graft)  BTF_ID(func, bpf_lsm_inet_csk_clone)  BTF_ID(func, bpf_lsm_inet_conn_established) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7dd8af06413..73780748404c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -337,6 +337,12 @@ const char *btf_type_str(const struct btf_type *t)  #define BTF_SHOW_NAME_SIZE		80  /* + * The suffix of a type that indicates it cannot alias another type when + * comparing BTF IDs for kfunc invocations. + */ +#define NOCAST_ALIAS_SUFFIX		"___init" + +/*   * Common data to all BTF show operations. Private show functions can add   * their own data to a structure containing a struct btf_show and consult it   * in the show callback.  See btf_type_show() below. @@ -1397,12 +1403,18 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,  	if (!bpf_verifier_log_needed(log))  		return; -	/* btf verifier prints all types it is processing via -	 * btf_verifier_log_type(..., fmt = NULL). -	 * Skip those prints for in-kernel BTF verification. -	 */ -	if (log->level == BPF_LOG_KERNEL && !fmt) -		return; +	if (log->level == BPF_LOG_KERNEL) { +		/* btf verifier prints all types it is processing via +		 * btf_verifier_log_type(..., fmt = NULL). +		 * Skip those prints for in-kernel BTF verification. +		 */ +		if (!fmt) +			return; + +		/* Skip logging when loading module BTF with mismatches permitted */ +		if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) +			return; +	}  	__btf_verifier_log(log, "[%u] %s %s%s",  			   env->log_type_id, @@ -1441,8 +1453,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,  	if (!bpf_verifier_log_needed(log))  		return; -	if (log->level == BPF_LOG_KERNEL && !fmt) -		return; +	if (log->level == BPF_LOG_KERNEL) { +		if (!fmt) +			return; + +		/* Skip logging when loading module BTF with mismatches permitted */ +		if (env->btf->base_btf && IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) +			return; +	} +  	/* The CHECK_META phase already did a btf dump.  	 
*  	 * If member is logged again, it must hit an error in @@ -3228,7 +3247,7 @@ struct btf_field_info {  		struct {  			const char *node_name;  			u32 value_btf_id; -		} list_head; +		} graph_root;  	};  }; @@ -3305,12 +3324,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf,  	return NULL;  } -static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, -			      const struct btf_type *t, int comp_idx, -			      u32 off, int sz, struct btf_field_info *info) +static int +btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, +		    const struct btf_type *t, int comp_idx, u32 off, +		    int sz, struct btf_field_info *info, +		    enum btf_field_type head_type)  { +	const char *node_field_name;  	const char *value_type; -	const char *list_node;  	s32 id;  	if (!__btf_type_is_struct(t)) @@ -3320,26 +3341,32 @@ static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,  	value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");  	if (!value_type)  		return -EINVAL; -	list_node = strstr(value_type, ":"); -	if (!list_node) +	node_field_name = strstr(value_type, ":"); +	if (!node_field_name)  		return -EINVAL; -	value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN); +	value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN);  	if (!value_type)  		return -ENOMEM;  	id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);  	kfree(value_type);  	if (id < 0)  		return id; -	list_node++; -	if (str_is_empty(list_node)) +	node_field_name++; +	if (str_is_empty(node_field_name))  		return -EINVAL; -	info->type = BPF_LIST_HEAD; +	info->type = head_type;  	info->off = off; -	info->list_head.value_btf_id = id; -	info->list_head.node_name = list_node; +	info->graph_root.value_btf_id = id; +	info->graph_root.node_name = node_field_name;  	return BTF_FIELD_FOUND;  } +#define field_mask_test_name(field_type, field_type_str) \ +	if (field_mask & field_type && !strcmp(name, field_type_str)) { \ +		type = field_type;					\ +		goto end;						\ +	} +  static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,  			      int *align, int *sz)  { @@ -3363,18 +3390,11 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,  			goto end;  		}  	} -	if (field_mask & BPF_LIST_HEAD) { -		if (!strcmp(name, "bpf_list_head")) { -			type = BPF_LIST_HEAD; -			goto end; -		} -	} -	if (field_mask & BPF_LIST_NODE) { -		if (!strcmp(name, "bpf_list_node")) { -			type = BPF_LIST_NODE; -			goto end; -		} -	} +	field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); +	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); +	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root"); +	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node"); +  	/* Only return BPF_KPTR when all other types with matchable names fail */  	if (field_mask & BPF_KPTR) {  		type = BPF_KPTR_REF; @@ -3387,6 +3407,8 @@ end:  	return type;  } +#undef field_mask_test_name +  static int btf_find_struct_field(const struct btf *btf,  				 const struct btf_type *t, u32 field_mask,  				 struct btf_field_info *info, int info_cnt) @@ -3419,6 +3441,7 @@ static int btf_find_struct_field(const struct btf *btf,  		case BPF_SPIN_LOCK:  		case BPF_TIMER:  		case BPF_LIST_NODE: +		case BPF_RB_NODE:  			ret = btf_find_struct(btf, member_type, off, sz, field_type,  					      idx < info_cnt ? 
&info[idx] : &tmp);  			if (ret < 0) @@ -3432,8 +3455,11 @@ static int btf_find_struct_field(const struct btf *btf,  				return ret;  			break;  		case BPF_LIST_HEAD: -			ret = btf_find_list_head(btf, t, member_type, i, off, sz, -						 idx < info_cnt ? &info[idx] : &tmp); +		case BPF_RB_ROOT: +			ret = btf_find_graph_root(btf, t, member_type, +						  i, off, sz, +						  idx < info_cnt ? &info[idx] : &tmp, +						  field_type);  			if (ret < 0)  				return ret;  			break; @@ -3480,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,  		case BPF_SPIN_LOCK:  		case BPF_TIMER:  		case BPF_LIST_NODE: +		case BPF_RB_NODE:  			ret = btf_find_struct(btf, var_type, off, sz, field_type,  					      idx < info_cnt ? &info[idx] : &tmp);  			if (ret < 0) @@ -3493,8 +3520,11 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,  				return ret;  			break;  		case BPF_LIST_HEAD: -			ret = btf_find_list_head(btf, var, var_type, -1, off, sz, -						 idx < info_cnt ? &info[idx] : &tmp); +		case BPF_RB_ROOT: +			ret = btf_find_graph_root(btf, var, var_type, +						  -1, off, sz, +						  idx < info_cnt ? &info[idx] : &tmp, +						  field_type);  			if (ret < 0)  				return ret;  			break; @@ -3596,21 +3626,25 @@ end_btf:  	return ret;  } -static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, -			       struct btf_field_info *info) +static int btf_parse_graph_root(const struct btf *btf, +				struct btf_field *field, +				struct btf_field_info *info, +				const char *node_type_name, +				size_t node_type_align)  {  	const struct btf_type *t, *n = NULL;  	const struct btf_member *member;  	u32 offset;  	int i; -	t = btf_type_by_id(btf, info->list_head.value_btf_id); +	t = btf_type_by_id(btf, info->graph_root.value_btf_id);  	/* We've already checked that value_btf_id is a struct type. We  	 * just need to figure out the offset of the list_node, and  	 * verify its type.  	 
*/  	for_each_member(i, t, member) { -		if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off))) +		if (strcmp(info->graph_root.node_name, +			   __btf_name_by_offset(btf, member->name_off)))  			continue;  		/* Invalid BTF, two members with same name */  		if (n) @@ -3618,24 +3652,38 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,  		n = btf_type_by_id(btf, member->type);  		if (!__btf_type_is_struct(n))  			return -EINVAL; -		if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off))) +		if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off)))  			return -EINVAL;  		offset = __btf_member_bit_offset(n, member);  		if (offset % 8)  			return -EINVAL;  		offset /= 8; -		if (offset % __alignof__(struct bpf_list_node)) +		if (offset % node_type_align)  			return -EINVAL; -		field->list_head.btf = (struct btf *)btf; -		field->list_head.value_btf_id = info->list_head.value_btf_id; -		field->list_head.node_offset = offset; +		field->graph_root.btf = (struct btf *)btf; +		field->graph_root.value_btf_id = info->graph_root.value_btf_id; +		field->graph_root.node_offset = offset;  	}  	if (!n)  		return -ENOENT;  	return 0;  } +static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, +			       struct btf_field_info *info) +{ +	return btf_parse_graph_root(btf, field, info, "bpf_list_node", +					    __alignof__(struct bpf_list_node)); +} + +static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field, +			     struct btf_field_info *info) +{ +	return btf_parse_graph_root(btf, field, info, "bpf_rb_node", +					    __alignof__(struct bpf_rb_node)); +} +  struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,  				    u32 field_mask, u32 value_size)  { @@ -3698,7 +3746,13 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type  			if (ret < 0)  				goto end;  			break; +		case BPF_RB_ROOT: +			ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]); +			if (ret < 0) +				goto end; +			break;  		case BPF_LIST_NODE: +		case BPF_RB_NODE:  			break;  		default:  			ret = -EFAULT; @@ -3707,8 +3761,33 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type  		rec->cnt++;  	} -	/* bpf_list_head requires bpf_spin_lock */ -	if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) { +	/* bpf_{list_head, rb_node} require bpf_spin_lock */ +	if ((btf_record_has_field(rec, BPF_LIST_HEAD) || +	     btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { +		ret = -EINVAL; +		goto end; +	} + +	/* need collection identity for non-owning refs before allowing this +	 * +	 * Consider a node type w/ both list and rb_node fields: +	 *   struct node { +	 *     struct bpf_list_node l; +	 *     struct bpf_rb_node r; +	 *   } +	 * +	 * Used like so: +	 *   struct node *n = bpf_obj_new(....); +	 *   bpf_list_push_front(&list_head, &n->l); +	 *   bpf_rbtree_remove(&rb_root, &n->r); +	 * +	 * It should not be possible to rbtree_remove the node since it hasn't +	 * been added to a tree. But push_front converts n to a non-owning +	 * reference, and rbtree_remove accepts the non-owning reference to +	 * a type w/ bpf_rb_node field. 
+	 */ +	if (btf_record_has_field(rec, BPF_LIST_NODE) && +	    btf_record_has_field(rec, BPF_RB_NODE)) {  		ret = -EINVAL;  		goto end;  	} @@ -3719,62 +3798,76 @@ end:  	return ERR_PTR(ret);  } +#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT) +#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE) +  int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)  {  	int i; -	/* There are two owning types, kptr_ref and bpf_list_head. The former -	 * only supports storing kernel types, which can never store references -	 * to program allocated local types, atleast not yet. Hence we only need -	 * to ensure that bpf_list_head ownership does not form cycles. +	/* There are three types that signify ownership of some other type: +	 *  kptr_ref, bpf_list_head, bpf_rb_root. +	 * kptr_ref only supports storing kernel types, which can't store +	 * references to program allocated local types. +	 * +	 * Hence we only need to ensure that bpf_{list_head,rb_root} ownership +	 * does not form cycles.  	 */ -	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD)) +	if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK))  		return 0;  	for (i = 0; i < rec->cnt; i++) {  		struct btf_struct_meta *meta;  		u32 btf_id; -		if (!(rec->fields[i].type & BPF_LIST_HEAD)) +		if (!(rec->fields[i].type & GRAPH_ROOT_MASK))  			continue; -		btf_id = rec->fields[i].list_head.value_btf_id; +		btf_id = rec->fields[i].graph_root.value_btf_id;  		meta = btf_find_struct_meta(btf, btf_id);  		if (!meta)  			return -EFAULT; -		rec->fields[i].list_head.value_rec = meta->record; +		rec->fields[i].graph_root.value_rec = meta->record; -		if (!(rec->field_mask & BPF_LIST_NODE)) +		/* We need to set value_rec for all root types, but no need +		 * to check ownership cycle for a type unless it's also a +		 * node type. +		 */ +		if (!(rec->field_mask & GRAPH_NODE_MASK))  			continue;  		/* We need to ensure ownership acyclicity among all types. The  		 * proper way to do it would be to topologically sort all BTF  		 * IDs based on the ownership edges, since there can be multiple -		 * bpf_list_head in a type. Instead, we use the following -		 * reasoning: +		 * bpf_{list_head,rb_node} in a type. Instead, we use the +		 * following resaoning:  		 *  		 * - A type can only be owned by another type in user BTF if it -		 *   has a bpf_list_node. +		 *   has a bpf_{list,rb}_node. Let's call these node types.  		 * - A type can only _own_ another type in user BTF if it has a -		 *   bpf_list_head. +		 *   bpf_{list_head,rb_root}. Let's call these root types.  		 * -		 * We ensure that if a type has both bpf_list_head and -		 * bpf_list_node, its element types cannot be owning types. +		 * We ensure that if a type is both a root and node, its +		 * element types cannot be root types.  		 *  		 * To ensure acyclicity:  		 * -		 * When A only has bpf_list_head, ownership chain can be: +		 * When A is an root type but not a node, its ownership +		 * chain can be:  		 *	A -> B -> C  		 * Where: -		 * - B has both bpf_list_head and bpf_list_node. -		 * - C only has bpf_list_node. +		 * - A is an root, e.g. has bpf_rb_root. +		 * - B is both a root and node, e.g. has bpf_rb_node and +		 *   bpf_list_head. +		 * - C is only an root, e.g. has bpf_list_node  		 * -		 * When A has both bpf_list_head and bpf_list_node, some other -		 * type already owns it in the BTF domain, hence it can not own -		 * another owning type through any of the bpf_list_head edges. 
+		 * When A is both a root and node, some other type already +		 * owns it in the BTF domain, hence it can not own +		 * another root type through any of the ownership edges.  		 *	A -> B  		 * Where: -		 * - B only has bpf_list_node. +		 * - A is both an root and node. +		 * - B is only an node.  		 */ -		if (meta->record->field_mask & BPF_LIST_HEAD) +		if (meta->record->field_mask & GRAPH_ROOT_MASK)  			return -ELOOP;  	}  	return 0; @@ -4476,6 +4569,7 @@ static int btf_datasec_resolve(struct btf_verifier_env *env,  	struct btf *btf = env->btf;  	u16 i; +	env->resolve_mode = RESOLVE_TBD;  	for_each_vsi_from(i, v->next_member, v->t, vsi) {  		u32 var_type_id = vsi->type, type_id, type_size = 0;  		const struct btf_type *var_type = btf_type_by_id(env->btf, @@ -5236,6 +5330,8 @@ static const char *alloc_obj_fields[] = {  	"bpf_spin_lock",  	"bpf_list_head",  	"bpf_list_node", +	"bpf_rb_root", +	"bpf_rb_node",  };  static struct btf_struct_metas * @@ -5309,7 +5405,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)  		type = &tab->types[tab->cnt];  		type->btf_id = i; -		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size); +		record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | +						  BPF_RB_ROOT | BPF_RB_NODE, t->size);  		/* The record cannot be unset, treat it as an error if so */  		if (IS_ERR_OR_NULL(record)) {  			ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; @@ -5573,6 +5670,7 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,  	if (!ctx_struct)  		/* should not happen */  		return NULL; +again:  	ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off);  	if (!ctx_tname) {  		/* should not happen */ @@ -5586,8 +5684,16 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,  	 * int socket_filter_bpf_prog(struct __sk_buff *skb)  	 * { // no fields of skb are ever used }  	 */ -	if (strcmp(ctx_tname, tname)) -		return NULL; +	if (strcmp(ctx_tname, tname)) { +		/* bpf_user_pt_regs_t is a typedef, so resolve it to +		 * underlying struct and check name again +		 */ +		if (!btf_type_is_modifier(ctx_struct)) +			return NULL; +		while (btf_type_is_modifier(ctx_struct)) +			ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type); +		goto again; +	}  	return ctx_type;  } @@ -6433,6 +6539,18 @@ static int __get_type_size(struct btf *btf, u32 btf_id,  	return -EINVAL;  } +static u8 __get_type_fmodel_flags(const struct btf_type *t) +{ +	u8 flags = 0; + +	if (__btf_type_is_struct(t)) +		flags |= BTF_FMODEL_STRUCT_ARG; +	if (btf_type_is_signed_int(t)) +		flags |= BTF_FMODEL_SIGNED_ARG; + +	return flags; +} +  int btf_distill_func_proto(struct bpf_verifier_log *log,  			   struct btf *btf,  			   const struct btf_type *func, @@ -6453,6 +6571,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,  			m->arg_flags[i] = 0;  		}  		m->ret_size = 8; +		m->ret_flags = 0;  		m->nr_args = MAX_BPF_FUNC_REG_ARGS;  		return 0;  	} @@ -6472,6 +6591,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,  		return -EINVAL;  	}  	m->ret_size = ret; +	m->ret_flags = __get_type_fmodel_flags(t);  	for (i = 0; i < nargs; i++) {  		if (i == nargs - 1 && args[i].type == 0) { @@ -6496,7 +6616,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,  			return -EINVAL;  		}  		m->arg_size[i] = ret; -		m->arg_flags[i] = __btf_type_is_struct(t) ? 
BTF_FMODEL_STRUCT_ARG : 0; +		m->arg_flags[i] = __get_type_fmodel_flags(t);  	}  	m->nr_args = nargs;  	return 0; @@ -7260,11 +7380,14 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,  		}  		btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);  		if (IS_ERR(btf)) { -			pr_warn("failed to validate module [%s] BTF: %ld\n", -				mod->name, PTR_ERR(btf));  			kfree(btf_mod); -			if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) +			if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) { +				pr_warn("failed to validate module [%s] BTF: %ld\n", +					mod->name, PTR_ERR(btf));  				err = PTR_ERR(btf); +			} else { +				pr_warn_once("Kernel module BTF mismatch detected, BTF debug info may be unavailable for some modules\n"); +			}  			goto out;  		}  		err = btf_alloc_id(btf); @@ -7782,9 +7905,9 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c  	sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL); -	return 0;  end: -	btf_free_dtor_kfunc_tab(btf); +	if (ret) +		btf_free_dtor_kfunc_tab(btf);  	btf_put(btf);  	return ret;  } @@ -8210,3 +8333,119 @@ out:  	}  	return err;  } + +bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, +				const struct bpf_reg_state *reg, +				int off) +{ +	struct btf *btf = reg->btf; +	const struct btf_type *walk_type, *safe_type; +	const char *tname; +	char safe_tname[64]; +	long ret, safe_id; +	const struct btf_member *member, *m_walk = NULL; +	u32 i; +	const char *walk_name; + +	walk_type = btf_type_by_id(btf, reg->btf_id); +	if (!walk_type) +		return false; + +	tname = btf_name_by_offset(btf, walk_type->name_off); + +	ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname); +	if (ret < 0) +		return false; + +	safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); +	if (safe_id < 0) +		return false; + +	safe_type = btf_type_by_id(btf, safe_id); +	if (!safe_type) +		return false; + +	for_each_member(i, walk_type, member) { +		u32 moff; + +		/* We're looking for the PTR_TO_BTF_ID member in the struct +		 * type we're walking which matches the specified offset. +		 * Below, we'll iterate over the fields in the safe variant of +		 * the struct and see if any of them has a matching type / +		 * name. +		 */ +		moff = __btf_member_bit_offset(walk_type, member) / 8; +		if (off == moff) { +			m_walk = member; +			break; +		} +	} +	if (m_walk == NULL) +		return false; + +	walk_name = __btf_name_by_offset(btf, m_walk->name_off); +	for_each_member(i, safe_type, member) { +		const char *m_name = __btf_name_by_offset(btf, member->name_off); + +		/* If we match on both type and name, the field is considered trusted. 
*/ +		if (m_walk->type == member->type && !strcmp(walk_name, m_name)) +			return true; +	} + +	return false; +} + +bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, +			       const struct btf *reg_btf, u32 reg_id, +			       const struct btf *arg_btf, u32 arg_id) +{ +	const char *reg_name, *arg_name, *search_needle; +	const struct btf_type *reg_type, *arg_type; +	int reg_len, arg_len, cmp_len; +	size_t pattern_len = sizeof(NOCAST_ALIAS_SUFFIX) - sizeof(char); + +	reg_type = btf_type_by_id(reg_btf, reg_id); +	if (!reg_type) +		return false; + +	arg_type = btf_type_by_id(arg_btf, arg_id); +	if (!arg_type) +		return false; + +	reg_name = btf_name_by_offset(reg_btf, reg_type->name_off); +	arg_name = btf_name_by_offset(arg_btf, arg_type->name_off); + +	reg_len = strlen(reg_name); +	arg_len = strlen(arg_name); + +	/* Exactly one of the two type names may be suffixed with ___init, so +	 * if the strings are the same size, they can't possibly be no-cast +	 * aliases of one another. If you have two of the same type names, e.g. +	 * they're both nf_conn___init, it would be improper to return true +	 * because they are _not_ no-cast aliases, they are the same type. +	 */ +	if (reg_len == arg_len) +		return false; + +	/* Either of the two names must be the other name, suffixed with ___init. */ +	if ((reg_len != arg_len + pattern_len) && +	    (arg_len != reg_len + pattern_len)) +		return false; + +	if (reg_len < arg_len) { +		search_needle = strstr(arg_name, NOCAST_ALIAS_SUFFIX); +		cmp_len = reg_len; +	} else { +		search_needle = strstr(reg_name, NOCAST_ALIAS_SUFFIX); +		cmp_len = arg_len; +	} + +	if (!search_needle) +		return false; + +	/* ___init suffix must come at the end of the name */ +	if (*(search_needle + pattern_len) != '\0') +		return false; + +	return !strncmp(reg_name, arg_name, cmp_len); +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba3fff17e2f9..b297e9f60ca1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -34,7 +34,9 @@  #include <linux/log2.h>  #include <linux/bpf_verifier.h>  #include <linux/nodemask.h> +#include <linux/nospec.h>  #include <linux/bpf_mem_alloc.h> +#include <linux/memcontrol.h>  #include <asm/barrier.h>  #include <asm/unaligned.h> @@ -87,7 +89,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns  struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)  { -	gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; +	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);  	struct bpf_prog_aux *aux;  	struct bpf_prog *fp; @@ -96,12 +98,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag  	if (fp == NULL)  		return NULL; -	aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); +	aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));  	if (aux == NULL) {  		vfree(fp);  		return NULL;  	} -	fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); +	fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));  	if (!fp->active) {  		vfree(fp);  		kfree(aux); @@ -126,7 +128,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag  struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)  { -	gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; +	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);  	struct bpf_prog *prog;  	int cpu; @@ -159,7 +161,7 @@ int 
bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)  	prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,  					  sizeof(*prog->aux->jited_linfo), -					  GFP_KERNEL_ACCOUNT | __GFP_NOWARN); +					  bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN));  	if (!prog->aux->jited_linfo)  		return -ENOMEM; @@ -234,7 +236,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,  struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,  				  gfp_t gfp_extra_flags)  { -	gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; +	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);  	struct bpf_prog *fp;  	u32 pages; @@ -1910,9 +1912,7 @@ out:  		 * reuse preexisting logic from Spectre v1 mitigation that  		 * happens to produce the required code on x86 for v4 as well.  		 */ -#ifdef CONFIG_X86  		barrier_nospec(); -#endif  		CONT;  #define LDST(SIZEOP, SIZE)						\  	STX_MEM_##SIZEOP:						\ @@ -2096,6 +2096,14 @@ bool bpf_prog_map_compatible(struct bpf_map *map,  	if (fp->kprobe_override)  		return false; +	/* XDP programs inserted into maps are not guaranteed to run on +	 * a particular netdev (and can run outside driver context entirely +	 * in the case of devmap and cpumap). Until device checks +	 * are implemented, prohibit adding dev-bound programs to program maps. +	 */ +	if (bpf_prog_is_dev_bound(fp->aux)) +		return false; +  	spin_lock(&map->owner.lock);  	if (!map->owner.type) {  		/* There's no owner yet where we could check for @@ -2182,7 +2190,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)  	 * valid program, which in this case would simply not  	 * be JITed, but falls back to the interpreter.  	 */ -	if (!bpf_prog_is_dev_bound(fp->aux)) { +	if (!bpf_prog_is_offloaded(fp->aux)) {  		*err = bpf_prog_alloc_jited_linfo(fp);  		if (*err)  			return fp; @@ -2554,7 +2562,7 @@ static void bpf_prog_free_deferred(struct work_struct *work)  	bpf_free_used_maps(aux);  	bpf_free_used_btfs(aux);  	if (bpf_prog_is_dev_bound(aux)) -		bpf_prog_offload_destroy(aux->prog); +		bpf_prog_dev_bound_destroy(aux->prog);  #ifdef CONFIG_PERF_EVENTS  	if (aux->prog->has_callchain_buf)  		put_callchain_buffers(); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index e0b2d016f0bf..d2110c1f6fa6 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -361,7 +361,7 @@ static int cpu_map_kthread_run(void *data)  		/* Support running another XDP prog on this CPU */  		nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);  		if (nframes) { -			m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs); +			m = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, skbs);  			if (unlikely(m == 0)) {  				for (i = 0; i < nframes; i++)  					skbs[i] = NULL; /* effect: xdp_return_frame */ diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c new file mode 100644 index 000000000000..52b981512a35 --- /dev/null +++ b/kernel/bpf/cpumask.c @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2023 Meta, Inc */ +#include <linux/bpf.h> +#include <linux/bpf_mem_alloc.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/cpumask.h> + +/** + * struct bpf_cpumask - refcounted BPF cpumask wrapper structure + * @cpumask:	The actual cpumask embedded in the struct. + * @usage:	Object reference counter. When the refcount goes to 0, the + *		memory is released back to the BPF allocator, which provides + *		RCU safety. 
+ * + * Note that we explicitly embed a cpumask_t rather than a cpumask_var_t.  This + * is done to avoid confusing the verifier due to the typedef of cpumask_var_t + * changing depending on whether CONFIG_CPUMASK_OFFSTACK is defined or not. See + * the details in <linux/cpumask.h>. The consequence is that this structure is + * likely a bit larger than it needs to be when CONFIG_CPUMASK_OFFSTACK is + * defined due to embedding the whole NR_CPUS-size bitmap, but the extra memory + * overhead is minimal. For the more typical case of CONFIG_CPUMASK_OFFSTACK + * not being defined, the structure is the same size regardless. + */ +struct bpf_cpumask { +	cpumask_t cpumask; +	refcount_t usage; +}; + +static struct bpf_mem_alloc bpf_cpumask_ma; + +static bool cpu_valid(u32 cpu) +{ +	return cpu < nr_cpu_ids; +} + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", +		  "Global kfuncs as their definitions will be in BTF"); + +/** + * bpf_cpumask_create() - Create a mutable BPF cpumask. + * + * Allocates a cpumask that can be queried, mutated, acquired, and released by + * a BPF program. The cpumask returned by this function must either be embedded + * in a map as a kptr, or freed with bpf_cpumask_release(). + * + * bpf_cpumask_create() allocates memory using the BPF memory allocator, and + * will not block. It may return NULL if no memory is available. + */ +__bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) +{ +	struct bpf_cpumask *cpumask; + +	/* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */ +	BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0); + +	cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask)); +	if (!cpumask) +		return NULL; + +	memset(cpumask, 0, sizeof(*cpumask)); +	refcount_set(&cpumask->usage, 1); + +	return cpumask; +} + +/** + * bpf_cpumask_acquire() - Acquire a reference to a BPF cpumask. + * @cpumask: The BPF cpumask being acquired. The cpumask must be a trusted + *	     pointer. + * + * Acquires a reference to a BPF cpumask. The cpumask returned by this function + * must either be embedded in a map as a kptr, or freed with + * bpf_cpumask_release(). + */ +__bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) +{ +	refcount_inc(&cpumask->usage); +	return cpumask; +} + +/** + * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask + *			    stored in a map. + * @cpumaskp: A pointer to a BPF cpumask map value. + * + * Attempts to acquire a reference to a BPF cpumask stored in a map value. The + * cpumask returned by this function must either be embedded in a map as a + * kptr, or freed with bpf_cpumask_release(). This function may return NULL if + * no BPF cpumask was found in the specified map value. + */ +__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp) +{ +	struct bpf_cpumask *cpumask; + +	/* The BPF memory allocator frees memory backing its caches in an RCU +	 * callback. Thus, we can safely use RCU to ensure that the cpumask is +	 * safe to read. +	 */ +	rcu_read_lock(); + +	cpumask = READ_ONCE(*cpumaskp); +	if (cpumask && !refcount_inc_not_zero(&cpumask->usage)) +		cpumask = NULL; + +	rcu_read_unlock(); +	return cpumask; +} + +/** + * bpf_cpumask_release() - Release a previously acquired BPF cpumask. + * @cpumask: The cpumask being released. + * + * Releases a previously acquired reference to a BPF cpumask. 
When the final + * reference of the BPF cpumask has been released, it is subsequently freed in + * an RCU callback in the BPF memory allocator. + */ +__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) +{ +	if (!cpumask) +		return; + +	if (refcount_dec_and_test(&cpumask->usage)) { +		migrate_disable(); +		bpf_mem_free(&bpf_cpumask_ma, cpumask); +		migrate_enable(); +	} +} + +/** + * bpf_cpumask_first() - Get the index of the first nonzero bit in the cpumask. + * @cpumask: The cpumask being queried. + * + * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask + * pointer may be safely passed to this function. + */ +__bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) +{ +	return cpumask_first(cpumask); +} + +/** + * bpf_cpumask_first_zero() - Get the index of the first unset bit in the + *			      cpumask. + * @cpumask: The cpumask being queried. + * + * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask + * pointer may be safely passed to this function. + */ +__bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) +{ +	return cpumask_first_zero(cpumask); +} + +/** + * bpf_cpumask_set_cpu() - Set a bit for a CPU in a BPF cpumask. + * @cpu: The CPU to be set in the cpumask. + * @cpumask: The BPF cpumask in which a bit is being set. + */ +__bpf_kfunc void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ +	if (!cpu_valid(cpu)) +		return; + +	cpumask_set_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_clear_cpu() - Clear a bit for a CPU in a BPF cpumask. + * @cpu: The CPU to be cleared from the cpumask. + * @cpumask: The BPF cpumask in which a bit is being cleared. + */ +__bpf_kfunc void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ +	if (!cpu_valid(cpu)) +		return; + +	cpumask_clear_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_cpu() - Test whether a CPU is set in a cpumask. + * @cpu: The CPU being queried for. + * @cpumask: The cpumask being queried for containing a CPU. + * + * Return: + * * true  - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is an invalid cpu. + */ +__bpf_kfunc bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) +{ +	if (!cpu_valid(cpu)) +		return false; + +	return cpumask_test_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_and_set_cpu() - Atomically test and set a CPU in a BPF cpumask. + * @cpu: The CPU being set and queried for. + * @cpumask: The BPF cpumask being set and queried for containing a CPU. + * + * Return: + * * true  - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is invalid. + */ +__bpf_kfunc bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ +	if (!cpu_valid(cpu)) +		return false; + +	return cpumask_test_and_set_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_test_and_clear_cpu() - Atomically test and clear a CPU in a BPF + *				      cpumask. + * @cpu: The CPU being cleared and queried for. + * @cpumask: The BPF cpumask being cleared and queried for containing a CPU. + * + * Return: + * * true  - @cpu is set in the cpumask + * * false - @cpu was not set in the cpumask, or @cpu is invalid. 
+ */ +__bpf_kfunc bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) +{ +	if (!cpu_valid(cpu)) +		return false; + +	return cpumask_test_and_clear_cpu(cpu, (struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_setall() - Set all of the bits in a BPF cpumask. + * @cpumask: The BPF cpumask having all of its bits set. + */ +__bpf_kfunc void bpf_cpumask_setall(struct bpf_cpumask *cpumask) +{ +	cpumask_setall((struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_clear() - Clear all of the bits in a BPF cpumask. + * @cpumask: The BPF cpumask being cleared. + */ +__bpf_kfunc void bpf_cpumask_clear(struct bpf_cpumask *cpumask) +{ +	cpumask_clear((struct cpumask *)cpumask); +} + +/** + * bpf_cpumask_and() - AND two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true  - @dst has at least one bit set following the operation + * * false - @dst is empty following the operation + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc bool bpf_cpumask_and(struct bpf_cpumask *dst, +				 const struct cpumask *src1, +				 const struct cpumask *src2) +{ +	return cpumask_and((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_or() - OR two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc void bpf_cpumask_or(struct bpf_cpumask *dst, +				const struct cpumask *src1, +				const struct cpumask *src2) +{ +	cpumask_or((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_xor() - XOR two cpumasks and store the result. + * @dst: The BPF cpumask where the result is being stored. + * @src1: The first input. + * @src2: The second input. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc void bpf_cpumask_xor(struct bpf_cpumask *dst, +				 const struct cpumask *src1, +				 const struct cpumask *src2) +{ +	cpumask_xor((struct cpumask *)dst, src1, src2); +} + +/** + * bpf_cpumask_equal() - Check two cpumasks for equality. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true   - @src1 and @src2 have the same bits set. + * * false  - @src1 and @src2 differ in at least one bit. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) +{ +	return cpumask_equal(src1, src2); +} + +/** + * bpf_cpumask_intersects() - Check two cpumasks for overlap. + * @src1: The first input. + * @src2: The second input. + * + * Return: + * * true   - @src1 and @src2 have at least one of the same bits set. + * * false  - @src1 and @src2 don't have any of the same bits set. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) +{ +	return cpumask_intersects(src1, src2); +} + +/** + * bpf_cpumask_subset() - Check if a cpumask is a subset of another. + * @src1: The first cpumask being checked as a subset. + * @src2: The second cpumask being checked as a superset. + * + * Return: + * * true   - All of the bits of @src1 are set in @src2. + * * false  - At least one bit in @src1 is not set in @src2. 
+ * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + */ +__bpf_kfunc bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) +{ +	return cpumask_subset(src1, src2); +} + +/** + * bpf_cpumask_empty() - Check if a cpumask is empty. + * @cpumask: The cpumask being checked. + * + * Return: + * * true   - None of the bits in @cpumask are set. + * * false  - At least one bit in @cpumask is set. + * + * A struct bpf_cpumask pointer may be safely passed to @cpumask. + */ +__bpf_kfunc bool bpf_cpumask_empty(const struct cpumask *cpumask) +{ +	return cpumask_empty(cpumask); +} + +/** + * bpf_cpumask_full() - Check if a cpumask has all bits set. + * @cpumask: The cpumask being checked. + * + * Return: + * * true   - All of the bits in @cpumask are set. + * * false  - At least one bit in @cpumask is cleared. + * + * A struct bpf_cpumask pointer may be safely passed to @cpumask. + */ +__bpf_kfunc bool bpf_cpumask_full(const struct cpumask *cpumask) +{ +	return cpumask_full(cpumask); +} + +/** + * bpf_cpumask_copy() - Copy the contents of a cpumask into a BPF cpumask. + * @dst: The BPF cpumask being copied into. + * @src: The cpumask being copied. + * + * A struct bpf_cpumask pointer may be safely passed to @src. + */ +__bpf_kfunc void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) +{ +	cpumask_copy((struct cpumask *)dst, src); +} + +/** + * bpf_cpumask_any() - Return a random set CPU from a cpumask. + * @cpumask: The cpumask being queried. + * + * Return: + * * A random set bit within [0, num_cpus) if at least one bit is set. + * * >= num_cpus if no bit is set. + * + * A struct bpf_cpumask pointer may be safely passed to @src. + */ +__bpf_kfunc u32 bpf_cpumask_any(const struct cpumask *cpumask) +{ +	return cpumask_any(cpumask); +} + +/** + * bpf_cpumask_any_and() - Return a random set CPU from the AND of two + *			   cpumasks. + * @src1: The first cpumask. + * @src2: The second cpumask. + * + * Return: + * * A random set bit within [0, num_cpus) if at least one bit is set. + * * >= num_cpus if no bit is set. + * + * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. 
+ */ +__bpf_kfunc u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) +{ +	return cpumask_any_and(src1, src2); +} + +__diag_pop(); + +BTF_SET8_START(cpumask_kfunc_btf_ids) +BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS) +BTF_SET8_END(cpumask_kfunc_btf_ids) + +static const struct btf_kfunc_id_set cpumask_kfunc_set = { +	.owner = THIS_MODULE, +	.set   = &cpumask_kfunc_btf_ids, +}; + +BTF_ID_LIST(cpumask_dtor_ids) +BTF_ID(struct, bpf_cpumask) +BTF_ID(func, bpf_cpumask_release) + +static int __init cpumask_kfunc_init(void) +{ +	int ret; +	const struct btf_id_dtor_kfunc cpumask_dtors[] = { +		{ +			.btf_id	      = cpumask_dtor_ids[0], +			.kfunc_btf_id = cpumask_dtor_ids[1] +		}, +	}; + +	ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false); +	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set); +	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set); +	return  ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors, +						   ARRAY_SIZE(cpumask_dtors), +						   THIS_MODULE); +} + +late_initcall(cpumask_kfunc_init); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index d01e4c55b376..2675fefc6cb6 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -474,7 +474,11 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,  {  	int err; -	if (!dev->netdev_ops->ndo_xdp_xmit) +	if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) +		return -EOPNOTSUPP; + +	if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && +		     xdp_frame_has_frags(xdpf)))  		return -EOPNOTSUPP;  	err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); @@ -532,8 +536,14 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,  static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)  { -	if (!obj || -	    !obj->dev->netdev_ops->ndo_xdp_xmit) +	if (!obj) +		return false; + +	if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) +		return false; + +	if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && +		     xdp_frame_has_frags(xdpf)))  		return false;  	if 
(xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5aa2b5525f79..5dfcb5ad0d06 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -152,7 +152,7 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,  {  	unsigned long flags; -	hash = hash & HASHTAB_MAP_LOCK_MASK; +	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);  	preempt_disable();  	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { @@ -171,7 +171,7 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,  				      struct bucket *b, u32 hash,  				      unsigned long flags)  { -	hash = hash & HASHTAB_MAP_LOCK_MASK; +	hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1);  	raw_spin_unlock_irqrestore(&b->raw_lock, flags);  	__this_cpu_dec(*(htab->map_locked[hash]));  	preempt_enable(); @@ -1004,8 +1004,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,  			l_new = ERR_PTR(-ENOMEM);  			goto dec_count;  		} -		check_and_init_map_value(&htab->map, -					 l_new->key + round_up(key_size, 8));  	}  	memcpy(l_new->key, key, key_size); @@ -1592,6 +1590,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,  			else  				copy_map_value(map, value, l->key +  					       roundup_key_size); +			/* Zeroing special fields in the temp buffer */  			check_and_init_map_value(map, value);  		} @@ -1792,6 +1791,7 @@ again_nocopy:  						      true);  			else  				copy_map_value(map, dst_val, value); +			/* Zeroing special fields in the temp buffer */  			check_and_init_map_value(map, dst_val);  		}  		if (do_delete) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index af30c6cbd65d..5b278a38ae58 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -756,19 +756,20 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,  /* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary   * arguments representation.   
*/ -#define MAX_BPRINTF_BUF_LEN	512 +#define MAX_BPRINTF_BIN_ARGS	512  /* Support executing three nested bprintf helper calls on a given CPU */  #define MAX_BPRINTF_NEST_LEVEL	3  struct bpf_bprintf_buffers { -	char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN]; +	char bin_args[MAX_BPRINTF_BIN_ARGS]; +	char buf[MAX_BPRINTF_BUF];  }; -static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); + +static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs);  static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_fmt_tmp_buf(char **tmp_buf) +static int try_get_buffers(struct bpf_bprintf_buffers **bufs)  { -	struct bpf_bprintf_buffers *bufs;  	int nest_level;  	preempt_disable(); @@ -778,18 +779,19 @@ static int try_get_fmt_tmp_buf(char **tmp_buf)  		preempt_enable();  		return -EBUSY;  	} -	bufs = this_cpu_ptr(&bpf_bprintf_bufs); -	*tmp_buf = bufs->tmp_bufs[nest_level - 1]; +	*bufs = this_cpu_ptr(&bpf_bprintf_bufs[nest_level - 1]);  	return 0;  } -void bpf_bprintf_cleanup(void) +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data)  { -	if (this_cpu_read(bpf_bprintf_nest_level)) { -		this_cpu_dec(bpf_bprintf_nest_level); -		preempt_enable(); -	} +	if (!data->bin_args && !data->buf) +		return; +	if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) +		return; +	this_cpu_dec(bpf_bprintf_nest_level); +	preempt_enable();  }  /* @@ -798,18 +800,20 @@ void bpf_bprintf_cleanup(void)   * Returns a negative value if fmt is an invalid format string or 0 otherwise.   *   * This can be used in two ways: - * - Format string verification only: when bin_args is NULL + * - Format string verification only: when data->get_bin_args is false   * - Arguments preparation: in addition to the above verification, it writes in - *   bin_args a binary representation of arguments usable by bstr_printf where - *   pointers from BPF have been sanitized. + *   data->bin_args a binary representation of arguments usable by bstr_printf + *   where pointers from BPF have been sanitized.   *   * In argument preparation mode, if 0 is returned, safe temporary buffers are   * allocated and bpf_bprintf_cleanup should be called to free them after use.   
*/  int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, -			u32 **bin_args, u32 num_args) +			u32 num_args, struct bpf_bprintf_data *data)  { +	bool get_buffers = (data->get_bin_args && num_args) || data->get_buf;  	char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; +	struct bpf_bprintf_buffers *buffers = NULL;  	size_t sizeof_cur_arg, sizeof_cur_ip;  	int err, i, num_spec = 0;  	u64 cur_arg; @@ -820,14 +824,19 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,  		return -EINVAL;  	fmt_size = fmt_end - fmt; -	if (bin_args) { -		if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) -			return -EBUSY; +	if (get_buffers && try_get_buffers(&buffers)) +		return -EBUSY; -		tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; -		*bin_args = (u32 *)tmp_buf; +	if (data->get_bin_args) { +		if (num_args) +			tmp_buf = buffers->bin_args; +		tmp_buf_end = tmp_buf + MAX_BPRINTF_BIN_ARGS; +		data->bin_args = (u32 *)tmp_buf;  	} +	if (data->get_buf) +		data->buf = buffers->buf; +  	for (i = 0; i < fmt_size; i++) {  		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) {  			err = -EINVAL; @@ -1021,31 +1030,33 @@ nocopy_fmt:  	err = 0;  out:  	if (err) -		bpf_bprintf_cleanup(); +		bpf_bprintf_cleanup(data);  	return err;  }  BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, -	   const void *, data, u32, data_len) +	   const void *, args, u32, data_len)  { +	struct bpf_bprintf_data data = { +		.get_bin_args	= true, +	};  	int err, num_args; -	u32 *bin_args;  	if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 || -	    (data_len && !data)) +	    (data_len && !args))  		return -EINVAL;  	num_args = data_len / 8;  	/* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we  	 * can safely give an unbounded size.  	 */ -	err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args); +	err = bpf_bprintf_prepare(fmt, UINT_MAX, args, num_args, &data);  	if (err < 0)  		return err; -	err = bstr_printf(str, str_size, fmt, bin_args); +	err = bstr_printf(str, str_size, fmt, data.bin_args); -	bpf_bprintf_cleanup(); +	bpf_bprintf_cleanup(&data);  	return err + 1;  } @@ -1745,12 +1756,12 @@ unlock:  	while (head != orig_head) {  		void *obj = head; -		obj -= field->list_head.node_offset; +		obj -= field->graph_root.node_offset;  		head = head->next;  		/* The contained type can also have resources, including a  		 * bpf_list_head which needs to be freed.  		 */ -		bpf_obj_free_fields(field->list_head.value_rec, obj); +		bpf_obj_free_fields(field->graph_root.value_rec, obj);  		/* bpf_mem_free requires migrate_disable(), since we can be  		 * called from map free path as well apart from BPF program (as  		 * part of map ops doing bpf_obj_free_fields). @@ -1761,11 +1772,51 @@ unlock:  	}  } +/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are + * 'rb_node *', so field name of rb_node within containing struct is not + * needed. 
+ * + * Since bpf_rb_tree's node type has a corresponding struct btf_field with + * graph_root.node_offset, it's not necessary to know field name + * or type of node struct + */ +#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ +	for (pos = rb_first_postorder(root); \ +	    pos && ({ n = rb_next_postorder(pos); 1; }); \ +	    pos = n) + +void bpf_rb_root_free(const struct btf_field *field, void *rb_root, +		      struct bpf_spin_lock *spin_lock) +{ +	struct rb_root_cached orig_root, *root = rb_root; +	struct rb_node *pos, *n; +	void *obj; + +	BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); +	BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); + +	__bpf_spin_lock_irqsave(spin_lock); +	orig_root = *root; +	*root = RB_ROOT_CACHED; +	__bpf_spin_unlock_irqrestore(spin_lock); + +	bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { +		obj = pos; +		obj -= field->graph_root.node_offset; + +		bpf_obj_free_fields(field->graph_root.value_rec, obj); + +		migrate_disable(); +		bpf_mem_free(&bpf_global_ma, obj); +		migrate_enable(); +	} +} +  __diag_push();  __diag_ignore_all("-Wmissing-prototypes",  		  "Global functions as their definitions will be in vmlinux BTF"); -void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) +__bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)  {  	struct btf_struct_meta *meta = meta__ign;  	u64 size = local_type_id__k; @@ -1779,7 +1830,7 @@ void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)  	return p;  } -void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) +__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)  {  	struct btf_struct_meta *meta = meta__ign;  	void *p = p__alloc; @@ -1800,12 +1851,12 @@ static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *hea  	tail ? 
list_add_tail(n, h) : list_add(n, h);  } -void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) +__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)  {  	return __bpf_list_add(node, head, false);  } -void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) +__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)  {  	return __bpf_list_add(node, head, true);  } @@ -1823,23 +1874,73 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai  	return (struct bpf_list_node *)n;  } -struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) +__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)  {  	return __bpf_list_del(head, false);  } -struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) +__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)  {  	return __bpf_list_del(head, true);  } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, +						  struct bpf_rb_node *node) +{ +	struct rb_root_cached *r = (struct rb_root_cached *)root; +	struct rb_node *n = (struct rb_node *)node; + +	rb_erase_cached(n, r); +	RB_CLEAR_NODE(n); +	return (struct bpf_rb_node *)n; +} + +/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF + * program + */ +static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, +			     void *less) +{ +	struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; +	bpf_callback_t cb = (bpf_callback_t)less; +	struct rb_node *parent = NULL; +	bool leftmost = true; + +	while (*link) { +		parent = *link; +		if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { +			link = &parent->rb_left; +		} else { +			link = &parent->rb_right; +			leftmost = false; +		} +	} + +	rb_link_node((struct rb_node *)node, parent, link); +	rb_insert_color_cached((struct rb_node *)node, +			       (struct rb_root_cached *)root, leftmost); +} + +__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, +				bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) +{ +	__bpf_rbtree_add(root, node, (void *)less); +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) +{ +	struct rb_root_cached *r = (struct rb_root_cached *)root; + +	return (struct bpf_rb_node *)rb_first_cached(r); +} +  /**   * bpf_task_acquire - Acquire a reference to a task. A task acquired by this   * kfunc which is not stored in a map as a kptr, must be released by calling   * bpf_task_release().   * @p: The task on which a reference is being acquired.   */ -struct task_struct *bpf_task_acquire(struct task_struct *p) +__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)  {  	return get_task_struct(p);  } @@ -1850,7 +1951,7 @@ struct task_struct *bpf_task_acquire(struct task_struct *p)   * released by calling bpf_task_release().   * @p: The task on which a reference is being acquired.   */ -struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) +__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)  {  	/* For the time being this function returns NULL, as it's not currently  	 * possible to safely acquire a reference to a task with RCU protection @@ -1902,7 +2003,7 @@ struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)   * be released by calling bpf_task_release().   
* @pp: A pointer to a task kptr on which a reference is being acquired.   */ -struct task_struct *bpf_task_kptr_get(struct task_struct **pp) +__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)  {  	/* We must return NULL here until we have clarity on how to properly  	 * leverage RCU for ensuring a task's lifetime. See the comment above @@ -1915,7 +2016,7 @@ struct task_struct *bpf_task_kptr_get(struct task_struct **pp)   * bpf_task_release - Release the reference acquired on a task.   * @p: The task on which a reference is being released.   */ -void bpf_task_release(struct task_struct *p) +__bpf_kfunc void bpf_task_release(struct task_struct *p)  {  	if (!p)  		return; @@ -1930,7 +2031,7 @@ void bpf_task_release(struct task_struct *p)   * calling bpf_cgroup_release().   * @cgrp: The cgroup on which a reference is being acquired.   */ -struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) +__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)  {  	cgroup_get(cgrp);  	return cgrp; @@ -1942,7 +2043,7 @@ struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)   * be released by calling bpf_cgroup_release().   * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.   */ -struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) +__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)  {  	struct cgroup *cgrp; @@ -1974,7 +2075,7 @@ struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)   * drops to 0.   * @cgrp: The cgroup on which a reference is being released.   */ -void bpf_cgroup_release(struct cgroup *cgrp) +__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)  {  	if (!cgrp)  		return; @@ -1989,7 +2090,7 @@ void bpf_cgroup_release(struct cgroup *cgrp)   * @cgrp: The cgroup for which we're performing a lookup.   * @level: The level of ancestor to look up.   */ -struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) +__bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)  {  	struct cgroup *ancestor; @@ -2008,7 +2109,7 @@ struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)   * stored in a map, or released with bpf_task_release().   * @pid: The pid of the task being looked up.   
*/ -struct task_struct *bpf_task_from_pid(s32 pid) +__bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)  {  	struct task_struct *p; @@ -2021,22 +2122,22 @@ struct task_struct *bpf_task_from_pid(s32 pid)  	return p;  } -void *bpf_cast_to_kern_ctx(void *obj) +__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)  {  	return obj;  } -void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) +__bpf_kfunc void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)  {  	return obj__ign;  } -void bpf_rcu_read_lock(void) +__bpf_kfunc void bpf_rcu_read_lock(void)  {  	rcu_read_lock();  } -void bpf_rcu_read_unlock(void) +__bpf_kfunc void bpf_rcu_read_unlock(void)  {  	rcu_read_unlock();  } @@ -2057,6 +2158,10 @@ BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)  BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)  BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)  BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_rbtree_add) +BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) +  #ifdef CONFIG_CGROUPS  BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)  BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4f841e16779e..9948b542a470 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -122,7 +122,7 @@ static struct inode *bpf_get_inode(struct super_block *sb,  	inode->i_mtime = inode->i_atime;  	inode->i_ctime = inode->i_atime; -	inode_init_owner(&init_user_ns, inode, dir, mode); +	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);  	return inode;  } @@ -152,7 +152,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,  	dir->i_ctime = dir->i_mtime;  } -static int bpf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,  		     struct dentry *dentry, umode_t mode)  {  	struct inode *inode; @@ -382,7 +382,7 @@ bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)  	return simple_lookup(dir, dentry, flags);  } -static int bpf_symlink(struct user_namespace *mnt_userns, struct inode *dir, +static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,  		       struct dentry *dentry, const char *target)  {  	char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); @@ -559,7 +559,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags)  static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)  {  	struct bpf_prog *prog; -	int ret = inode_permission(&init_user_ns, inode, MAY_READ); +	int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);  	if (ret)  		return ERR_PTR(ret); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index ebcc3dd0fa19..5fcdacbb8439 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -71,7 +71,7 @@ static int bpf_mem_cache_idx(size_t size)  	if (size <= 192)  		return size_index[(size - 1) / 8] - 1; -	return fls(size - 1) - 1; +	return fls(size - 1) - 2;  }  #define NUM_CACHES 11 @@ -143,7 +143,7 @@ static void *__alloc(struct bpf_mem_cache *c, int node)  		return obj;  	} -	return kmalloc_node(c->unit_size, flags, node); +	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);  }  static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) @@ -395,7 +395,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)  		unit_size = 
size;  #ifdef CONFIG_MEMCG_KMEM -		objcg = get_obj_cgroup_from_current(); +		if (memcg_bpf_enabled()) +			objcg = get_obj_cgroup_from_current();  #endif  		for_each_possible_cpu(cpu) {  			c = per_cpu_ptr(pc, cpu); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 13e4efc971e6..0c85e06f7ea7 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -41,7 +41,7 @@ struct bpf_offload_dev {  struct bpf_offload_netdev {  	struct rhash_head l;  	struct net_device *netdev; -	struct bpf_offload_dev *offdev; +	struct bpf_offload_dev *offdev; /* NULL when bound-only */  	struct list_head progs;  	struct list_head maps;  	struct list_head offdev_netdevs; @@ -56,7 +56,6 @@ static const struct rhashtable_params offdevs_params = {  };  static struct rhashtable offdevs; -static bool offdevs_inited;  static int bpf_dev_offload_check(struct net_device *netdev)  { @@ -72,58 +71,218 @@ bpf_offload_find_netdev(struct net_device *netdev)  {  	lockdep_assert_held(&bpf_devs_lock); -	if (!offdevs_inited) -		return NULL;  	return rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params);  } -int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) +static int __bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, +					     struct net_device *netdev)  {  	struct bpf_offload_netdev *ondev; -	struct bpf_prog_offload *offload;  	int err; -	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && -	    attr->prog_type != BPF_PROG_TYPE_XDP) -		return -EINVAL; +	ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); +	if (!ondev) +		return -ENOMEM; -	if (attr->prog_flags) -		return -EINVAL; +	ondev->netdev = netdev; +	ondev->offdev = offdev; +	INIT_LIST_HEAD(&ondev->progs); +	INIT_LIST_HEAD(&ondev->maps); + +	err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); +	if (err) { +		netdev_warn(netdev, "failed to register for BPF offload\n"); +		goto err_free; +	} + +	if (offdev) +		list_add(&ondev->offdev_netdevs, &offdev->netdevs); +	return 0; + +err_free: +	kfree(ondev); +	return err; +} + +static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +{ +	struct bpf_prog_offload *offload = prog->aux->offload; + +	if (offload->dev_state) +		offload->offdev->ops->destroy(prog); + +	list_del_init(&offload->offloads); +	kfree(offload); +	prog->aux->offload = NULL; +} + +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, +			       enum bpf_netdev_command cmd) +{ +	struct netdev_bpf data = {}; +	struct net_device *netdev; + +	ASSERT_RTNL(); + +	data.command = cmd; +	data.offmap = offmap; +	/* Caller must make sure netdev is valid */ +	netdev = offmap->netdev; + +	return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ +	WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); +	/* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ +	bpf_map_free_id(&offmap->map); +	list_del_init(&offmap->offloads); +	offmap->netdev = NULL; +} + +static void __bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, +						struct net_device *netdev) +{ +	struct bpf_offload_netdev *ondev, *altdev = NULL; +	struct bpf_offloaded_map *offmap, *mtmp; +	struct bpf_prog_offload *offload, *ptmp; + +	ASSERT_RTNL(); + +	ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); +	if (WARN_ON(!ondev)) +		return; + +	WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); + +	/* Try to move the objects to another netdev of the device */ +	if (offdev) { +		
list_del(&ondev->offdev_netdevs); +		altdev = list_first_entry_or_null(&offdev->netdevs, +						  struct bpf_offload_netdev, +						  offdev_netdevs); +	} + +	if (altdev) { +		list_for_each_entry(offload, &ondev->progs, offloads) +			offload->netdev = altdev->netdev; +		list_splice_init(&ondev->progs, &altdev->progs); + +		list_for_each_entry(offmap, &ondev->maps, offloads) +			offmap->netdev = altdev->netdev; +		list_splice_init(&ondev->maps, &altdev->maps); +	} else { +		list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) +			__bpf_prog_offload_destroy(offload->prog); +		list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) +			__bpf_map_offload_destroy(offmap); +	} + +	WARN_ON(!list_empty(&ondev->progs)); +	WARN_ON(!list_empty(&ondev->maps)); +	kfree(ondev); +} + +static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *netdev) +{ +	struct bpf_offload_netdev *ondev; +	struct bpf_prog_offload *offload; +	int err;  	offload = kzalloc(sizeof(*offload), GFP_USER);  	if (!offload)  		return -ENOMEM;  	offload->prog = prog; +	offload->netdev = netdev; -	offload->netdev = dev_get_by_index(current->nsproxy->net_ns, -					   attr->prog_ifindex); -	err = bpf_dev_offload_check(offload->netdev); -	if (err) -		goto err_maybe_put; - -	down_write(&bpf_devs_lock);  	ondev = bpf_offload_find_netdev(offload->netdev);  	if (!ondev) { -		err = -EINVAL; -		goto err_unlock; +		if (bpf_prog_is_offloaded(prog->aux)) { +			err = -EINVAL; +			goto err_free; +		} + +		/* When only binding to the device, explicitly +		 * create an entry in the hashtable. +		 */ +		err = __bpf_offload_dev_netdev_register(NULL, offload->netdev); +		if (err) +			goto err_free; +		ondev = bpf_offload_find_netdev(offload->netdev);  	}  	offload->offdev = ondev->offdev;  	prog->aux->offload = offload;  	list_add_tail(&offload->offloads, &ondev->progs); -	dev_put(offload->netdev); -	up_write(&bpf_devs_lock);  	return 0; -err_unlock: -	up_write(&bpf_devs_lock); -err_maybe_put: -	if (offload->netdev) -		dev_put(offload->netdev); +err_free:  	kfree(offload);  	return err;  } +int bpf_prog_dev_bound_init(struct bpf_prog *prog, union bpf_attr *attr) +{ +	struct net_device *netdev; +	int err; + +	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && +	    attr->prog_type != BPF_PROG_TYPE_XDP) +		return -EINVAL; + +	if (attr->prog_flags & ~BPF_F_XDP_DEV_BOUND_ONLY) +		return -EINVAL; + +	if (attr->prog_type == BPF_PROG_TYPE_SCHED_CLS && +	    attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY) +		return -EINVAL; + +	netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); +	if (!netdev) +		return -EINVAL; + +	err = bpf_dev_offload_check(netdev); +	if (err) +		goto out; + +	prog->aux->offload_requested = !(attr->prog_flags & BPF_F_XDP_DEV_BOUND_ONLY); + +	down_write(&bpf_devs_lock); +	err = __bpf_prog_dev_bound_init(prog, netdev); +	up_write(&bpf_devs_lock); + +out: +	dev_put(netdev); +	return err; +} + +int bpf_prog_dev_bound_inherit(struct bpf_prog *new_prog, struct bpf_prog *old_prog) +{ +	int err; + +	if (!bpf_prog_is_dev_bound(old_prog->aux)) +		return 0; + +	if (bpf_prog_is_offloaded(old_prog->aux)) +		return -EINVAL; + +	new_prog->aux->dev_bound = old_prog->aux->dev_bound; +	new_prog->aux->offload_requested = old_prog->aux->offload_requested; + +	down_write(&bpf_devs_lock); +	if (!old_prog->aux->offload) { +		err = -EINVAL; +		goto out; +	} + +	err = __bpf_prog_dev_bound_init(new_prog, old_prog->aux->offload->netdev); + +out: +	up_write(&bpf_devs_lock); +	return err; +} +  int 
bpf_prog_offload_verifier_prep(struct bpf_prog *prog)  {  	struct bpf_prog_offload *offload; @@ -209,27 +368,25 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)  	up_read(&bpf_devs_lock);  } -static void __bpf_prog_offload_destroy(struct bpf_prog *prog) +void bpf_prog_dev_bound_destroy(struct bpf_prog *prog)  { -	struct bpf_prog_offload *offload = prog->aux->offload; - -	if (offload->dev_state) -		offload->offdev->ops->destroy(prog); - -	/* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ -	bpf_prog_free_id(prog, true); - -	list_del_init(&offload->offloads); -	kfree(offload); -	prog->aux->offload = NULL; -} +	struct bpf_offload_netdev *ondev; +	struct net_device *netdev; -void bpf_prog_offload_destroy(struct bpf_prog *prog) -{ +	rtnl_lock();  	down_write(&bpf_devs_lock); -	if (prog->aux->offload) +	if (prog->aux->offload) { +		list_del_init(&prog->aux->offload->offloads); + +		netdev = prog->aux->offload->netdev;  		__bpf_prog_offload_destroy(prog); + +		ondev = bpf_offload_find_netdev(netdev); +		if (!ondev->offdev && list_empty(&ondev->progs)) +			__bpf_offload_dev_netdev_unregister(NULL, netdev); +	}  	up_write(&bpf_devs_lock); +	rtnl_unlock();  }  static int bpf_prog_offload_translate(struct bpf_prog *prog) @@ -343,22 +500,6 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,  const struct bpf_prog_ops bpf_offload_prog_ops = {  }; -static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, -			       enum bpf_netdev_command cmd) -{ -	struct netdev_bpf data = {}; -	struct net_device *netdev; - -	ASSERT_RTNL(); - -	data.command = cmd; -	data.offmap = offmap; -	/* Caller must make sure netdev is valid */ -	netdev = offmap->netdev; - -	return netdev->netdev_ops->ndo_bpf(netdev, &data); -} -  struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)  {  	struct net *net = current->nsproxy->net_ns; @@ -408,15 +549,6 @@ err_unlock:  	return ERR_PTR(err);  } -static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) -{ -	WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); -	/* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ -	bpf_map_free_id(&offmap->map, true); -	list_del_init(&offmap->offloads); -	offmap->netdev = NULL; -} -  void bpf_map_offload_map_free(struct bpf_map *map)  {  	struct bpf_offloaded_map *offmap = map_to_offmap(map); @@ -576,12 +708,28 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct net_device *netdev)  }  EXPORT_SYMBOL_GPL(bpf_offload_dev_match); +bool bpf_prog_dev_bound_match(const struct bpf_prog *lhs, const struct bpf_prog *rhs) +{ +	bool ret; + +	if (bpf_prog_is_offloaded(lhs->aux) != bpf_prog_is_offloaded(rhs->aux)) +		return false; + +	down_read(&bpf_devs_lock); +	ret = lhs->aux->offload && rhs->aux->offload && +	      lhs->aux->offload->netdev && +	      lhs->aux->offload->netdev == rhs->aux->offload->netdev; +	up_read(&bpf_devs_lock); + +	return ret; +} +  bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)  {  	struct bpf_offloaded_map *offmap;  	bool ret; -	if (!bpf_map_is_dev_bound(map)) +	if (!bpf_map_is_offloaded(map))  		return bpf_map_offload_neutral(map);  	offmap = map_to_offmap(map); @@ -595,32 +743,11 @@ bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map)  int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev,  				    struct net_device *netdev)  { -	struct bpf_offload_netdev *ondev;  	int err; -	ondev = kzalloc(sizeof(*ondev), GFP_KERNEL); -	if (!ondev) -		return -ENOMEM; - -	
ondev->netdev = netdev; -	ondev->offdev = offdev; -	INIT_LIST_HEAD(&ondev->progs); -	INIT_LIST_HEAD(&ondev->maps); -  	down_write(&bpf_devs_lock); -	err = rhashtable_insert_fast(&offdevs, &ondev->l, offdevs_params); -	if (err) { -		netdev_warn(netdev, "failed to register for BPF offload\n"); -		goto err_unlock_free; -	} - -	list_add(&ondev->offdev_netdevs, &offdev->netdevs); -	up_write(&bpf_devs_lock); -	return 0; - -err_unlock_free: +	err = __bpf_offload_dev_netdev_register(offdev, netdev);  	up_write(&bpf_devs_lock); -	kfree(ondev);  	return err;  }  EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register); @@ -628,43 +755,8 @@ EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_register);  void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev,  				       struct net_device *netdev)  { -	struct bpf_offload_netdev *ondev, *altdev; -	struct bpf_offloaded_map *offmap, *mtmp; -	struct bpf_prog_offload *offload, *ptmp; - -	ASSERT_RTNL(); -  	down_write(&bpf_devs_lock); -	ondev = rhashtable_lookup_fast(&offdevs, &netdev, offdevs_params); -	if (WARN_ON(!ondev)) -		goto unlock; - -	WARN_ON(rhashtable_remove_fast(&offdevs, &ondev->l, offdevs_params)); -	list_del(&ondev->offdev_netdevs); - -	/* Try to move the objects to another netdev of the device */ -	altdev = list_first_entry_or_null(&offdev->netdevs, -					  struct bpf_offload_netdev, -					  offdev_netdevs); -	if (altdev) { -		list_for_each_entry(offload, &ondev->progs, offloads) -			offload->netdev = altdev->netdev; -		list_splice_init(&ondev->progs, &altdev->progs); - -		list_for_each_entry(offmap, &ondev->maps, offloads) -			offmap->netdev = altdev->netdev; -		list_splice_init(&ondev->maps, &altdev->maps); -	} else { -		list_for_each_entry_safe(offload, ptmp, &ondev->progs, offloads) -			__bpf_prog_offload_destroy(offload->prog); -		list_for_each_entry_safe(offmap, mtmp, &ondev->maps, offloads) -			__bpf_map_offload_destroy(offmap); -	} - -	WARN_ON(!list_empty(&ondev->progs)); -	WARN_ON(!list_empty(&ondev->maps)); -	kfree(ondev); -unlock: +	__bpf_offload_dev_netdev_unregister(offdev, netdev);  	up_write(&bpf_devs_lock);  }  EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); @@ -673,18 +765,6 @@ struct bpf_offload_dev *  bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv)  {  	struct bpf_offload_dev *offdev; -	int err; - -	down_write(&bpf_devs_lock); -	if (!offdevs_inited) { -		err = rhashtable_init(&offdevs, &offdevs_params); -		if (err) { -			up_write(&bpf_devs_lock); -			return ERR_PTR(err); -		} -		offdevs_inited = true; -	} -	up_write(&bpf_devs_lock);  	offdev = kzalloc(sizeof(*offdev), GFP_KERNEL);  	if (!offdev) @@ -710,3 +790,67 @@ void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev)  	return offdev->priv;  }  EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); + +void bpf_dev_bound_netdev_unregister(struct net_device *dev) +{ +	struct bpf_offload_netdev *ondev; + +	ASSERT_RTNL(); + +	down_write(&bpf_devs_lock); +	ondev = bpf_offload_find_netdev(dev); +	if (ondev && !ondev->offdev) +		__bpf_offload_dev_netdev_unregister(NULL, ondev->netdev); +	up_write(&bpf_devs_lock); +} + +int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log, +			      struct bpf_prog_aux *prog_aux) +{ +	if (!bpf_prog_is_dev_bound(prog_aux)) { +		bpf_log(log, "metadata kfuncs require device-bound program\n"); +		return -EINVAL; +	} + +	if (bpf_prog_is_offloaded(prog_aux)) { +		bpf_log(log, "metadata kfuncs can't be offloaded\n"); +		return -EINVAL; +	} + +	return 0; +} + +void *bpf_dev_bound_resolve_kfunc(struct bpf_prog *prog, u32 
func_id) +{ +	const struct xdp_metadata_ops *ops; +	void *p = NULL; + +	/* We don't hold bpf_devs_lock while resolving several +	 * kfuncs and can race with the unregister_netdevice(). +	 * We rely on bpf_dev_bound_match() check at attach +	 * to render this program unusable. +	 */ +	down_read(&bpf_devs_lock); +	if (!prog->aux->offload) +		goto out; + +	ops = prog->aux->offload->netdev->xdp_metadata_ops; +	if (!ops) +		goto out; + +	if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_TIMESTAMP)) +		p = ops->xmo_rx_timestamp; +	else if (func_id == bpf_xdp_metadata_kfunc_id(XDP_METADATA_KFUNC_RX_HASH)) +		p = ops->xmo_rx_hash; +out: +	up_read(&bpf_devs_lock); + +	return p; +} + +static int __init bpf_offload_init(void) +{ +	return rhashtable_init(&offdevs, &offdevs_params); +} + +late_initcall(bpf_offload_init); diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 5106b5372f0c..b56f9f3314fd 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -3,7 +3,11 @@  #include <linux/init.h>  #include <linux/module.h>  #include "bpf_preload.h" -#include "iterators/iterators.lskel.h" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#include "iterators/iterators.lskel-little-endian.h" +#else +#include "iterators/iterators.lskel-big-endian.h" +#endif  static struct bpf_link *maps_link, *progs_link;  static struct iterators_bpf *skel; diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index 6762b1260f2f..8937dc6bc8d0 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -35,20 +35,22 @@ endif  .PHONY: all clean -all: iterators.lskel.h +all: iterators.lskel-little-endian.h + +big: iterators.lskel-big-endian.h  clean:  	$(call msg,CLEAN)  	$(Q)rm -rf $(OUTPUT) iterators -iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) +iterators.lskel-%.h: $(OUTPUT)/%/iterators.bpf.o | $(BPFTOOL)  	$(call msg,GEN-SKEL,$@)  	$(Q)$(BPFTOOL) gen skeleton -L $< > $@ - -$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) +$(OUTPUT)/%/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)  	$(call msg,BPF,$@) -	$(Q)$(CLANG) -g -O2 -target bpf $(INCLUDES)			      \ +	$(Q)mkdir -p $(@D) +	$(Q)$(CLANG) -g -O2 -target bpf -m$* $(INCLUDES)		      \  		 -c $(filter %.c,$^) -o $@ &&				      \  	$(LLVM_STRIP) -g $@ diff --git a/kernel/bpf/preload/iterators/README b/kernel/bpf/preload/iterators/README index 7fd6d39a9ad2..98e7c90ea012 100644 --- a/kernel/bpf/preload/iterators/README +++ b/kernel/bpf/preload/iterators/README @@ -1,4 +1,7 @@  WARNING: -If you change "iterators.bpf.c" do "make -j" in this directory to rebuild "iterators.skel.h". +If you change "iterators.bpf.c" do "make -j" in this directory to +rebuild "iterators.lskel-little-endian.h". Then, on a big-endian +machine, do "make -j big" in this directory to rebuild +"iterators.lskel-big-endian.h". Commit both resulting headers.  Make sure to have clang 10 installed.  See Documentation/bpf/bpf_devel_QA.rst diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h new file mode 100644 index 000000000000..ebdc6c0cdb70 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h @@ -0,0 +1,419 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* THIS FILE IS AUTOGENERATED BY BPFTOOL! 
*/ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include <bpf/skel_internal.h> + +struct iterators_bpf { +	struct bpf_loader_ctx ctx; +	struct { +		struct bpf_map_desc rodata; +	} maps; +	struct { +		struct bpf_prog_desc dump_bpf_map; +		struct bpf_prog_desc dump_bpf_prog; +	} progs; +	struct { +		int dump_bpf_map_fd; +		int dump_bpf_prog_fd; +	} links; +}; + +static inline int +iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel) +{ +	int prog_fd = skel->progs.dump_bpf_map.prog_fd; +	int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + +	if (fd > 0) +		skel->links.dump_bpf_map_fd = fd; +	return fd; +} + +static inline int +iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel) +{ +	int prog_fd = skel->progs.dump_bpf_prog.prog_fd; +	int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + +	if (fd > 0) +		skel->links.dump_bpf_prog_fd = fd; +	return fd; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *skel) +{ +	int ret = 0; + +	ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel); +	ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel); +	return ret < 0 ? ret : 0; +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *skel) +{ +	skel_closenz(skel->links.dump_bpf_map_fd); +	skel_closenz(skel->links.dump_bpf_prog_fd); +} +static void +iterators_bpf__destroy(struct iterators_bpf *skel) +{ +	if (!skel) +		return; +	iterators_bpf__detach(skel); +	skel_closenz(skel->progs.dump_bpf_map.prog_fd); +	skel_closenz(skel->progs.dump_bpf_prog.prog_fd); +	skel_closenz(skel->maps.rodata.map_fd); +	skel_free(skel); +} +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ +	struct iterators_bpf *skel; + +	skel = skel_alloc(sizeof(*skel)); +	if (!skel) +		goto cleanup; +	skel->ctx.sz = (void *)&skel->links - (void *)skel; +	return skel; +cleanup: +	iterators_bpf__destroy(skel); +	return NULL; +} + +static inline int +iterators_bpf__load(struct iterators_bpf *skel) +{ +	struct bpf_load_and_run_opts opts = {}; +	int err; + +	opts.ctx = (struct bpf_loader_ctx *)skel; +	opts.data_sz = 6008; +	opts.data = (void *)"\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\ +\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\ +\x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\ +\0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\ +\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\ +\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\ +\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\ +\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\ +\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\ +\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\ +\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\ +\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\ +\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\ +\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\ +\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\ +\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\ +\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\ +\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\ +\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\ +\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\ +\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\ +\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\ +\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\ +\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\ +\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\ +\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\ +\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\ +\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\ 
+\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\ +\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\ +\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\ +\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\ +\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\ +\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\ +\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\ +\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\ +\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\ +\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\ +\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\ +\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\ +\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\ +\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\ +\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\ +\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\ +\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\ +\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\ +\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\ +\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\ +\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\ +\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\ +\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\ +\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\ +\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\ +\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\ +\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\ +\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\ +\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\ +\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\ +\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\ +\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\ +\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\ +\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\ +\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ +\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\ +\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\ +\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\ +\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\ +\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\ 
+\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ +\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\ +\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\ +\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\ +\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\ +\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\ +\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\ +\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\ +\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\ +\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\ +\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\ +\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\ +\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\ +\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\ +\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\ +\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\ +\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\ +\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\ +\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\ +\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\ +\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\ +\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\ +\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ +\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ +\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ +\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\ +\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\ +\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\ +\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\ +\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\ +\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\ +\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\ +\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\ +\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\ +\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\ +\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\ +\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\ 
+\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\ +\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\ +\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\ +\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\ +\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\ +\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\ +\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\ +\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\ +\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\ +\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\ +\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\ +\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\ +\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\ +\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\ +\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\ +\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ +\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\ +\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\ +\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\ +\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\ +\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\ +\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\ +\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\ +\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\ +\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\ +\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\ +\0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\ +\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\ +\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\ +\xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\ +\xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\ +\xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\ +\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\ +\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\ +\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\ +\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\ +\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\ +\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\ +\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\ +\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\ 
+\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\ +\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\ +\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\ +\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\ +\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\ +\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\ +\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\ +\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\ +\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\ +\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\ +\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\ +\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\ +\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\ +\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\ +\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\ +\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\ +\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\ +\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ +\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\ +\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; +	opts.insns_sz = 2216; +	opts.insns = (void *)"\ +\xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\ +\0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\ +\x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\ +\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x80\0\0\0\0\xd5\ +\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x84\0\0\0\0\xd5\x10\0\x01\0\ +\0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\ +\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ +\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\ +\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ +\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\ +\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\ +\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\ +\0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ +\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ +\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\ +\x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\ +\x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\ +\0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\ +\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ +\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\ +\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\ +\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\ +\x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\ +\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ +\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\ +\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ +\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\ +\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\ +\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\ +\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\ +\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\ +\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\ +\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\ +\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\ +\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\ +\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\ +\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\ +\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\ +\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\ +\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\ +\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\ +\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\ +\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ +\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\ +\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\ +\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\ +\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\ +\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\ +\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\ +\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\ +\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\ +\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\ +\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\ +\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\ 
+\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\ +\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\ +\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\ +\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\ +\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\ +\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\ +\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\ +\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\ +\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\ +\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0"; +	err = bpf_load_and_run(&opts); +	if (err < 0) +		return err; +	return 0; +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ +	struct iterators_bpf *skel; + +	skel = iterators_bpf__open(); +	if (!skel) +		return NULL; +	if (iterators_bpf__load(skel)) { +		iterators_bpf__destroy(skel); +		return NULL; +	} +	return skel; +} + +__attribute__((unused)) static void +iterators_bpf__assert(struct iterators_bpf *s __attribute__((unused))) +{ +#ifdef __cplusplus +#define _Static_assert static_assert +#endif +#ifdef __cplusplus +#undef _Static_assert +#endif +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h index 70f236a82fe1..70f236a82fe1 100644 --- a/kernel/bpf/preload/iterators/iterators.lskel.h +++ b/kernel/bpf/preload/iterators/iterators.lskel-little-endian.h diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 80f4b4d88aaf..8732e0aadf36 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -269,7 +269,7 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma  		if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)  			return -EPERM;  	} else { -		vma->vm_flags &= ~VM_MAYWRITE; +		vm_flags_clear(vma, VM_MAYWRITE);  	}  	/* remap_vmalloc_range() checks size and offset constraints */  	return remap_vmalloc_range(vma, rb_map->rb, @@ -290,7 +290,7 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma  			 */  			return -EPERM;  	} else { -		vma->vm_flags &= ~VM_MAYWRITE; +		vm_flags_clear(vma, VM_MAYWRITE);  	}  	/* remap_vmalloc_range() checks size and offset constraints */  	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 64131f88c553..adc83cb82f37 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -181,7 +181,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,  	int err;  	/* Need to create a kthread, thus must support schedule */ -	if (bpf_map_is_dev_bound(map)) { +	if (bpf_map_is_offloaded(map)) {  		return bpf_map_offload_update_elem(map, key, value, flags);  	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||  		   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) { @@ -238,7 +238,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,  	void *ptr;  	int err; -	if (bpf_map_is_dev_bound(map)) +	if (bpf_map_is_offloaded(map))  		return bpf_map_offload_lookup_elem(map, key, value);  	bpf_disable_instrumentation(); @@ -309,7 +309,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable)  	 * __GFP_RETRY_MAYFAIL to avoid such situations.  	 
*/ -	const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; +	gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO);  	unsigned int flags = 0;  	unsigned long align = 1;  	void *area; @@ -390,7 +390,7 @@ static int bpf_map_alloc_id(struct bpf_map *map)  	return id > 0 ? 0 : id;  } -void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map)  {  	unsigned long flags; @@ -402,18 +402,12 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)  	if (!map->id)  		return; -	if (do_idr_lock) -		spin_lock_irqsave(&map_idr_lock, flags); -	else -		__acquire(&map_idr_lock); +	spin_lock_irqsave(&map_idr_lock, flags);  	idr_remove(&map_idr, map->id);  	map->id = 0; -	if (do_idr_lock) -		spin_unlock_irqrestore(&map_idr_lock, flags); -	else -		__release(&map_idr_lock); +	spin_unlock_irqrestore(&map_idr_lock, flags);  }  #ifdef CONFIG_MEMCG_KMEM @@ -424,7 +418,8 @@ static void bpf_map_save_memcg(struct bpf_map *map)  	 * So we have to check map->objcg for being NULL each time it's  	 * being used.  	 */ -	map->objcg = get_obj_cgroup_from_current(); +	if (memcg_bpf_enabled()) +		map->objcg = get_obj_cgroup_from_current();  }  static void bpf_map_release_memcg(struct bpf_map *map) @@ -470,6 +465,21 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)  	return ptr;  } +void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, +		       gfp_t flags) +{ +	struct mem_cgroup *memcg, *old_memcg; +	void *ptr; + +	memcg = bpf_map_get_memcg(map); +	old_memcg = set_active_memcg(memcg); +	ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); +	set_active_memcg(old_memcg); +	mem_cgroup_put(memcg); + +	return ptr; +} +  void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,  				    size_t align, gfp_t flags)  { @@ -527,9 +537,6 @@ void btf_record_free(struct btf_record *rec)  		return;  	for (i = 0; i < rec->cnt; i++) {  		switch (rec->fields[i].type) { -		case BPF_SPIN_LOCK: -		case BPF_TIMER: -			break;  		case BPF_KPTR_UNREF:  		case BPF_KPTR_REF:  			if (rec->fields[i].kptr.module) @@ -538,7 +545,11 @@ void btf_record_free(struct btf_record *rec)  			break;  		case BPF_LIST_HEAD:  		case BPF_LIST_NODE: -			/* Nothing to release for bpf_list_head */ +		case BPF_RB_ROOT: +		case BPF_RB_NODE: +		case BPF_SPIN_LOCK: +		case BPF_TIMER: +			/* Nothing to release */  			break;  		default:  			WARN_ON_ONCE(1); @@ -571,9 +582,6 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)  	new_rec->cnt = 0;  	for (i = 0; i < rec->cnt; i++) {  		switch (fields[i].type) { -		case BPF_SPIN_LOCK: -		case BPF_TIMER: -			break;  		case BPF_KPTR_UNREF:  		case BPF_KPTR_REF:  			btf_get(fields[i].kptr.btf); @@ -584,7 +592,11 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)  			break;  		case BPF_LIST_HEAD:  		case BPF_LIST_NODE: -			/* Nothing to acquire for bpf_list_head */ +		case BPF_RB_ROOT: +		case BPF_RB_NODE: +		case BPF_SPIN_LOCK: +		case BPF_TIMER: +			/* Nothing to acquire */  			break;  		default:  			ret = -EFAULT; @@ -664,7 +676,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)  				continue;  			bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);  			break; +		case BPF_RB_ROOT: +			if (WARN_ON_ONCE(rec->spin_lock_off < 0)) +				continue; +			bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); +			break;  		case BPF_LIST_NODE: +		case BPF_RB_NODE:  			break;  		default:  			WARN_ON_ONCE(1); @@ -706,13 +724,13 @@ static void bpf_map_put_uref(struct 
bpf_map *map)  }  /* decrement map refcnt and schedule it for freeing via workqueue - * (unrelying map implementation ops->map_free() might sleep) + * (underlying map implementation ops->map_free() might sleep)   */ -static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) +void bpf_map_put(struct bpf_map *map)  {  	if (atomic64_dec_and_test(&map->refcnt)) {  		/* bpf_map_free_id() must be called first */ -		bpf_map_free_id(map, do_idr_lock); +		bpf_map_free_id(map);  		btf_put(map->btf);  		INIT_WORK(&map->work, bpf_map_free_deferred);  		/* Avoid spawning kworkers, since they all might contend @@ -721,11 +739,6 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)  		queue_work(system_unbound_wq, &map->work);  	}  } - -void bpf_map_put(struct bpf_map *map) -{ -	__bpf_map_put(map, true); -}  EXPORT_SYMBOL_GPL(bpf_map_put);  void bpf_map_put_with_uref(struct bpf_map *map) @@ -882,10 +895,10 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)  	/* set default open/close callbacks */  	vma->vm_ops = &bpf_map_default_vmops;  	vma->vm_private_data = map; -	vma->vm_flags &= ~VM_MAYEXEC; +	vm_flags_clear(vma, VM_MAYEXEC);  	if (!(vma->vm_flags & VM_WRITE))  		/* disallow re-mapping with PROT_WRITE */ -		vma->vm_flags &= ~VM_MAYWRITE; +		vm_flags_clear(vma, VM_MAYWRITE);  	err = map->ops->map_mmap(map, vma);  	if (err) @@ -1005,7 +1018,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,  		return -EINVAL;  	map->record = btf_parse_fields(btf, value_type, -				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD, +				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | +				       BPF_RB_ROOT,  				       map->value_size);  	if (!IS_ERR_OR_NULL(map->record)) {  		int i; @@ -1053,6 +1067,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,  				}  				break;  			case BPF_LIST_HEAD: +			case BPF_RB_ROOT:  				if (map->map_type != BPF_MAP_TYPE_HASH &&  				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&  				    map->map_type != BPF_MAP_TYPE_ARRAY) { @@ -1483,7 +1498,7 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)  		goto err_put;  	} -	if (bpf_map_is_dev_bound(map)) { +	if (bpf_map_is_offloaded(map)) {  		err = bpf_map_offload_delete_elem(map, key);  		goto out;  	} else if (IS_FD_PROG_ARRAY(map) || @@ -1547,7 +1562,7 @@ static int map_get_next_key(union bpf_attr *attr)  	if (!next_key)  		goto free_key; -	if (bpf_map_is_dev_bound(map)) { +	if (bpf_map_is_offloaded(map)) {  		err = bpf_map_offload_get_next_key(map, key, next_key);  		goto out;  	} @@ -1605,7 +1620,7 @@ int generic_map_delete_batch(struct bpf_map *map,  				   map->key_size))  			break; -		if (bpf_map_is_dev_bound(map)) { +		if (bpf_map_is_offloaded(map)) {  			err = bpf_map_offload_delete_elem(map, key);  			break;  		} @@ -1851,7 +1866,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)  		   map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||  		   map->map_type == BPF_MAP_TYPE_LRU_HASH ||  		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { -		if (!bpf_map_is_dev_bound(map)) { +		if (!bpf_map_is_offloaded(map)) {  			bpf_disable_instrumentation();  			rcu_read_lock();  			err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags); @@ -1944,7 +1959,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)  	if (!ops)  		return -EINVAL; -	if (!bpf_prog_is_dev_bound(prog->aux)) +	if (!bpf_prog_is_offloaded(prog->aux))  		prog->aux->ops = ops;  	else  		prog->aux->ops = 
&bpf_offload_prog_ops; @@ -1972,7 +1987,7 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op)  		return;  	if (audit_enabled == AUDIT_OFF)  		return; -	if (op == BPF_AUDIT_LOAD) +	if (!in_irq() && !irqs_disabled())  		ctx = audit_context();  	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);  	if (unlikely(!ab)) @@ -2001,7 +2016,7 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)  	return id > 0 ? 0 : id;  } -void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog)  {  	unsigned long flags; @@ -2013,18 +2028,10 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)  	if (!prog->aux->id)  		return; -	if (do_idr_lock) -		spin_lock_irqsave(&prog_idr_lock, flags); -	else -		__acquire(&prog_idr_lock); - +	spin_lock_irqsave(&prog_idr_lock, flags);  	idr_remove(&prog_idr, prog->aux->id);  	prog->aux->id = 0; - -	if (do_idr_lock) -		spin_unlock_irqrestore(&prog_idr_lock, flags); -	else -		__release(&prog_idr_lock); +	spin_unlock_irqrestore(&prog_idr_lock, flags);  }  static void __bpf_prog_put_rcu(struct rcu_head *rcu) @@ -2067,17 +2074,15 @@ static void bpf_prog_put_deferred(struct work_struct *work)  	prog = aux->prog;  	perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);  	bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); +	bpf_prog_free_id(prog);  	__bpf_prog_put_noref(prog, true);  } -static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) +static void __bpf_prog_put(struct bpf_prog *prog)  {  	struct bpf_prog_aux *aux = prog->aux;  	if (atomic64_dec_and_test(&aux->refcnt)) { -		/* bpf_prog_free_id() must be called first */ -		bpf_prog_free_id(prog, do_idr_lock); -  		if (in_irq() || irqs_disabled()) {  			INIT_WORK(&aux->work, bpf_prog_put_deferred);  			schedule_work(&aux->work); @@ -2089,7 +2094,7 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)  void bpf_prog_put(struct bpf_prog *prog)  { -	__bpf_prog_put(prog, true); +	__bpf_prog_put(prog);  }  EXPORT_SYMBOL_GPL(bpf_prog_put); @@ -2255,7 +2260,7 @@ bool bpf_prog_get_ok(struct bpf_prog *prog,  	if (prog->type != *attach_type)  		return false; -	if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) +	if (bpf_prog_is_offloaded(prog->aux) && !attach_drv)  		return false;  	return true; @@ -2491,7 +2496,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)  				 BPF_F_TEST_STATE_FREQ |  				 BPF_F_SLEEPABLE |  				 BPF_F_TEST_RND_HI32 | -				 BPF_F_XDP_HAS_FRAGS)) +				 BPF_F_XDP_HAS_FRAGS | +				 BPF_F_XDP_DEV_BOUND_ONLY))  		return -EINVAL;  	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2575,7 +2581,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)  	prog->aux->attach_btf = attach_btf;  	prog->aux->attach_btf_id = attr->attach_btf_id;  	prog->aux->dst_prog = dst_prog; -	prog->aux->offload_requested = !!attr->prog_ifindex; +	prog->aux->dev_bound = !!attr->prog_ifindex;  	prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;  	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; @@ -2599,7 +2605,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)  	prog->gpl_compatible = is_gpl ? 
1 : 0;  	if (bpf_prog_is_dev_bound(prog->aux)) { -		err = bpf_prog_offload_init(prog, attr); +		err = bpf_prog_dev_bound_init(prog, attr); +		if (err) +			goto free_prog_sec; +	} + +	if (type == BPF_PROG_TYPE_EXT && dst_prog && +	    bpf_prog_is_dev_bound(dst_prog->aux)) { +		err = bpf_prog_dev_bound_inherit(prog, dst_prog);  		if (err)  			goto free_prog_sec;  	} @@ -3997,7 +4010,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,  			return -EFAULT;  	} -	if (bpf_prog_is_dev_bound(prog->aux)) { +	if (bpf_prog_is_offloaded(prog->aux)) {  		err = bpf_prog_offload_info_fill(&info, prog);  		if (err)  			return err; @@ -4225,7 +4238,7 @@ static int bpf_map_get_info_by_fd(struct file *file,  	}  	info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; -	if (bpf_map_is_dev_bound(map)) { +	if (bpf_map_is_offloaded(map)) {  		err = bpf_map_offload_info_fill(&info, map);  		if (err)  			return err; @@ -5319,7 +5332,6 @@ static struct ctl_table bpf_syscall_table[] = {  	{  		.procname	= "bpf_stats_enabled",  		.data		= &bpf_stats_enabled_key.key, -		.maxlen		= sizeof(bpf_stats_enabled_key),  		.mode		= 0644,  		.proc_handler	= bpf_stats_handler,  	}, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 85f96c1e9f62..272563a0b770 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -190,6 +190,10 @@ struct bpf_verifier_stack_elem {  static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);  static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); +static void invalidate_non_owning_refs(struct bpf_verifier_env *env); +static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); +static int ref_set_non_owning(struct bpf_verifier_env *env, +			      struct bpf_reg_state *reg);  static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)  { @@ -255,6 +259,7 @@ struct bpf_call_arg_meta {  	int mem_size;  	u64 msize_max_value;  	int ref_obj_id; +	int dynptr_id;  	int map_uid;  	int func_id;  	struct btf *btf; @@ -456,6 +461,11 @@ static bool type_is_ptr_alloc_obj(u32 type)  	return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;  } +static bool type_is_non_owning_ref(u32 type) +{ +	return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} +  static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)  {  	struct btf_record *rec = NULL; @@ -638,31 +648,57 @@ static void print_liveness(struct bpf_verifier_env *env,  		verbose(env, "D");  } -static int get_spi(s32 off) +static int __get_spi(s32 off)  {  	return (-off - 1) / BPF_REG_SIZE;  } +static struct bpf_func_state *func(struct bpf_verifier_env *env, +				   const struct bpf_reg_state *reg) +{ +	struct bpf_verifier_state *cur = env->cur_state; + +	return cur->frame[reg->frameno]; +} +  static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)  { -	int allocated_slots = state->allocated_stack / BPF_REG_SIZE; +       int allocated_slots = state->allocated_stack / BPF_REG_SIZE; -	/* We need to check that slots between [spi - nr_slots + 1, spi] are -	 * within [0, allocated_stack). -	 * -	 * Please note that the spi grows downwards. For example, a dynptr -	 * takes the size of two stack slots; the first slot will be at -	 * spi and the second slot will be at spi - 1. -	 */ -	return spi - nr_slots + 1 >= 0 && spi < allocated_slots; +       /* We need to check that slots between [spi - nr_slots + 1, spi] are +	* within [0, allocated_stack). 
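The dynptr_get_spi() helper introduced above (together with the __get_spi() rename) maps a dynptr's frame-pointer offset to a stack slot index and insists the offset is constant, 8-byte aligned, and leaves room for both slots. A standalone sketch of that arithmetic, using local names rather than the kernel symbols:

/* Illustrative only: mirrors the slot-index arithmetic of __get_spi() and
 * the extra checks dynptr_get_spi() performs; nothing here is kernel code.
 */
#include <stdio.h>

#define REG_SIZE	8	/* one verifier stack slot is 8 bytes */

static int spi_of(int off)	/* off: negative offset from the frame pointer */
{
	return (-off - 1) / REG_SIZE;
}

int main(void)
{
	int off = -16;		/* a dynptr living at fp-16 */
	int spi = spi_of(off);

	/* dynptr_get_spi() additionally requires a constant, 8-byte aligned
	 * offset and spi >= 1 so that both slots (spi and spi - 1) exist.
	 */
	if (off % REG_SIZE || spi < 1)
		printf("off=%d cannot hold a dynptr\n", off);
	else
		printf("off=%d -> first slot spi=%d, second slot spi=%d\n",
		       off, spi, spi - 1);
	return 0;
}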
+	* +	* Please note that the spi grows downwards. For example, a dynptr +	* takes the size of two stack slots; the first slot will be at +	* spi and the second slot will be at spi - 1. +	*/ +       return spi - nr_slots + 1 >= 0 && spi < allocated_slots;  } -static struct bpf_func_state *func(struct bpf_verifier_env *env, -				   const struct bpf_reg_state *reg) +static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  { -	struct bpf_verifier_state *cur = env->cur_state; +	int off, spi; -	return cur->frame[reg->frameno]; +	if (!tnum_is_const(reg->var_off)) { +		verbose(env, "dynptr has to be at a constant offset\n"); +		return -EINVAL; +	} + +	off = reg->off + reg->var_off.value; +	if (off % BPF_REG_SIZE) { +		verbose(env, "cannot pass in dynptr at an offset=%d\n", off); +		return -EINVAL; +	} + +	spi = __get_spi(off); +	if (spi < 1) { +		verbose(env, "cannot pass in dynptr at an offset=%d\n", off); +		return -EINVAL; +	} + +	if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS)) +		return -ERANGE; +	return spi;  }  static const char *kernel_type_name(const struct btf* btf, u32 id) @@ -727,37 +763,58 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)  static void __mark_dynptr_reg(struct bpf_reg_state *reg,  			      enum bpf_dynptr_type type, -			      bool first_slot); +			      bool first_slot, int dynptr_id);  static void __mark_reg_not_init(const struct bpf_verifier_env *env,  				struct bpf_reg_state *reg); -static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, +static void mark_dynptr_stack_regs(struct bpf_verifier_env *env, +				   struct bpf_reg_state *sreg1,  				   struct bpf_reg_state *sreg2,  				   enum bpf_dynptr_type type)  { -	__mark_dynptr_reg(sreg1, type, true); -	__mark_dynptr_reg(sreg2, type, false); +	int id = ++env->id_gen; + +	__mark_dynptr_reg(sreg1, type, true, id); +	__mark_dynptr_reg(sreg2, type, false, id);  } -static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, +static void mark_dynptr_cb_reg(struct bpf_verifier_env *env, +			       struct bpf_reg_state *reg,  			       enum bpf_dynptr_type type)  { -	__mark_dynptr_reg(reg, type, true); +	__mark_dynptr_reg(reg, type, true, ++env->id_gen);  } +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, +				        struct bpf_func_state *state, int spi);  static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,  				   enum bpf_arg_type arg_type, int insn_idx)  {  	struct bpf_func_state *state = func(env, reg);  	enum bpf_dynptr_type type; -	int spi, i, id; - -	spi = get_spi(reg->off); - -	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) -		return -EINVAL; +	int spi, i, id, err; + +	spi = dynptr_get_spi(env, reg); +	if (spi < 0) +		return spi; + +	/* We cannot assume both spi and spi - 1 belong to the same dynptr, +	 * hence we need to call destroy_if_dynptr_stack_slot twice for both, +	 * to ensure that for the following example: +	 *	[d1][d1][d2][d2] +	 * spi    3   2   1   0 +	 * So marking spi = 2 should lead to destruction of both d1 and d2. In +	 * case they do belong to same dynptr, second call won't see slot_type +	 * as STACK_DYNPTR and will simply skip destruction. 
+	 */ +	err = destroy_if_dynptr_stack_slot(env, state, spi); +	if (err) +		return err; +	err = destroy_if_dynptr_stack_slot(env, state, spi - 1); +	if (err) +		return err;  	for (i = 0; i < BPF_REG_SIZE; i++) {  		state->stack[spi].slot_type[i] = STACK_DYNPTR; @@ -768,7 +825,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_  	if (type == BPF_DYNPTR_TYPE_INVALID)  		return -EINVAL; -	mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, +	mark_dynptr_stack_regs(env, &state->stack[spi].spilled_ptr,  			       &state->stack[spi - 1].spilled_ptr, type);  	if (dynptr_type_refcounted(type)) { @@ -781,6 +838,9 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_  		state->stack[spi - 1].spilled_ptr.ref_obj_id = id;  	} +	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; +	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; +  	return 0;  } @@ -789,10 +849,9 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re  	struct bpf_func_state *state = func(env, reg);  	int spi, i; -	spi = get_spi(reg->off); - -	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) -		return -EINVAL; +	spi = dynptr_get_spi(env, reg); +	if (spi < 0) +		return spi;  	for (i = 0; i < BPF_REG_SIZE; i++) {  		state->stack[spi].slot_type[i] = STACK_INVALID; @@ -805,43 +864,133 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re  	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr);  	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + +	/* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID slot? +	 * +	 * While we don't allow reading STACK_INVALID, it is still possible to +	 * do <8 byte writes marking some but not all slots as STACK_MISC. Then, +	 * helpers or insns can do partial read of that part without failing, +	 * but check_stack_range_initialized, check_stack_read_var_off, and +	 * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of +	 * the slot conservatively. Hence we need to prevent those liveness +	 * marking walks. +	 * +	 * This was not a problem before because STACK_INVALID is only set by +	 * default (where the default reg state has its reg->parent as NULL), or +	 * in clean_live_states after REG_LIVE_DONE (at which point +	 * mark_reg_read won't walk reg->parent chain), but not randomly during +	 * verifier state exploration (like we did above). Hence, for our case +	 * parentage chain will still be live (i.e. reg->parent may be +	 * non-NULL), while earlier reg->parent was NULL, so we need +	 * REG_LIVE_WRITTEN to screen off read marker propagation when it is +	 * done later on reads or by mark_dynptr_read as well to unnecessary +	 * mark registers in verifier state. 
+	 */ +	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; +	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; +  	return 0;  } -static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static void __mark_reg_unknown(const struct bpf_verifier_env *env, +			       struct bpf_reg_state *reg); + +static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, +				        struct bpf_func_state *state, int spi)  { -	struct bpf_func_state *state = func(env, reg); -	int spi, i; +	struct bpf_func_state *fstate; +	struct bpf_reg_state *dreg; +	int i, dynptr_id; -	if (reg->type == CONST_PTR_TO_DYNPTR) -		return false; +	/* We always ensure that STACK_DYNPTR is never set partially, +	 * hence just checking for slot_type[0] is enough. This is +	 * different for STACK_SPILL, where it may be only set for +	 * 1 byte, so code has to use is_spilled_reg. +	 */ +	if (state->stack[spi].slot_type[0] != STACK_DYNPTR) +		return 0; -	spi = get_spi(reg->off); -	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) -		return true; +	/* Reposition spi to first slot */ +	if (!state->stack[spi].spilled_ptr.dynptr.first_slot) +		spi = spi + 1; +	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { +		verbose(env, "cannot overwrite referenced dynptr\n"); +		return -EINVAL; +	} + +	mark_stack_slot_scratched(env, spi); +	mark_stack_slot_scratched(env, spi - 1); + +	/* Writing partially to one dynptr stack slot destroys both. */  	for (i = 0; i < BPF_REG_SIZE; i++) { -		if (state->stack[spi].slot_type[i] == STACK_DYNPTR || -		    state->stack[spi - 1].slot_type[i] == STACK_DYNPTR) -			return false; +		state->stack[spi].slot_type[i] = STACK_INVALID; +		state->stack[spi - 1].slot_type[i] = STACK_INVALID;  	} +	dynptr_id = state->stack[spi].spilled_ptr.id; +	/* Invalidate any slices associated with this dynptr */ +	bpf_for_each_reg_in_vstate(env->cur_state, fstate, dreg, ({ +		/* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */ +		if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM) +			continue; +		if (dreg->dynptr_id == dynptr_id) { +			if (!env->allow_ptr_leaks) +				__mark_reg_not_init(env, dreg); +			else +				__mark_reg_unknown(env, dreg); +		} +	})); + +	/* Do not release reference state, we are destroying dynptr on stack, +	 * not using some helper to release it. Just reset register. +	 */ +	__mark_reg_not_init(env, &state->stack[spi].spilled_ptr); +	__mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); + +	/* Same reason as unmark_stack_slots_dynptr above */ +	state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; +	state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; + +	return 0; +} + +static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +				       int spi) +{ +	if (reg->type == CONST_PTR_TO_DYNPTR) +		return false; + +	/* For -ERANGE (i.e. spi not falling into allocated stack slots), we +	 * will do check_mem_access to check and update stack bounds later, so +	 * return true for that case. +	 */ +	if (spi < 0) +		return spi == -ERANGE; +	/* We allow overwriting existing unreferenced STACK_DYNPTR slots, see +	 * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to +	 * ensure dynptr objects at the slots we are touching are completely +	 * destructed before we reinitialize them for a new one. 
For referenced +	 * ones, destroy_if_dynptr_stack_slot returns an error early instead of +	 * delaying it until the end where the user will get "Unreleased +	 * reference" error. +	 */  	return true;  } -static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, +				     int spi)  {  	struct bpf_func_state *state = func(env, reg); -	int spi;  	int i;  	/* This already represents first slot of initialized bpf_dynptr */  	if (reg->type == CONST_PTR_TO_DYNPTR)  		return true; -	spi = get_spi(reg->off); -	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || -	    !state->stack[spi].spilled_ptr.dynptr.first_slot) +	if (spi < 0) +		return false; +	if (!state->stack[spi].spilled_ptr.dynptr.first_slot)  		return false;  	for (i = 0; i < BPF_REG_SIZE; i++) { @@ -868,7 +1017,9 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg  	if (reg->type == CONST_PTR_TO_DYNPTR) {  		return reg->dynptr.type == dynptr_type;  	} else { -		spi = get_spi(reg->off); +		spi = dynptr_get_spi(env, reg); +		if (spi < 0) +			return false;  		return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;  	}  } @@ -931,6 +1082,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,  				verbose_a("id=%d", reg->id);  			if (reg->ref_obj_id)  				verbose_a("ref_obj_id=%d", reg->ref_obj_id); +			if (type_is_non_owning_ref(reg->type)) +				verbose_a("%s", "non_own_ref");  			if (t != SCALAR_VALUE)  				verbose_a("off=%d", reg->off);  			if (type_is_pkt_pointer(t)) @@ -1404,9 +1557,11 @@ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)   */  static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)  { -	/* Clear id, off, and union(map_ptr, range) */ +	/* Clear off and union(map_ptr, range) */  	memset(((u8 *)reg) + sizeof(reg->type), 0,  	       offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); +	reg->id = 0; +	reg->ref_obj_id = 0;  	___mark_reg_known(reg, imm);  } @@ -1447,7 +1602,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,  }  static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, -			      bool first_slot) +			      bool first_slot, int dynptr_id)  {  	/* reg->type has no meaning for STACK_DYNPTR, but when we set reg for  	 * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply @@ -1455,6 +1610,8 @@ static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type ty  	 */  	__mark_reg_known_zero(reg);  	reg->type = CONST_PTR_TO_DYNPTR; +	/* Give each dynptr a unique id to uniquely associate slices to it. 
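The dynptr_id assigned in __mark_dynptr_reg() above, combined with destroy_if_dynptr_stack_slot(), lets the verifier tie bpf_dynptr_data() slices back to the dynptr they came from and invalidate them when those stack slots are reused. A selftest-style sketch of the pattern this catches; the helper prototypes come from bpf_helpers.h, while the buffers and section name are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Illustrative global buffers backing two local (unreferenced) dynptrs. */
char buf1[16], buf2[16];

SEC("tp/syscalls/sys_enter_nanosleep")
int dynptr_slice_reuse(void *ctx)
{
	struct bpf_dynptr ptr;
	__u8 *slice;

	bpf_dynptr_from_mem(buf1, sizeof(buf1), 0, &ptr);
	slice = bpf_dynptr_data(&ptr, 0, 8);
	if (!slice)
		return 0;

	/* Re-initializing the same stack slots destroys the first dynptr;
	 * with dynptr_id tracking, 'slice' is marked unknown as well, so the
	 * load below is rejected instead of reading through a stale slice.
	 */
	bpf_dynptr_from_mem(buf2, sizeof(buf2), 0, &ptr);

	return slice[0];
}

char _license[] SEC("license") = "GPL";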
*/ +	reg->id = dynptr_id;  	reg->dynptr.type = type;  	reg->dynptr.first_slot = first_slot;  } @@ -1486,6 +1643,16 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)  	reg->type &= ~PTR_MAYBE_NULL;  } +static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno, +				struct btf_field_graph_root *ds_head) +{ +	__mark_reg_known_zero(®s[regno]); +	regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC; +	regs[regno].btf = ds_head->btf; +	regs[regno].btf_id = ds_head->value_btf_id; +	regs[regno].off = ds_head->node_offset; +} +  static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)  {  	return type_is_pkt_pointer(reg->type); @@ -1752,11 +1919,13 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,  			       struct bpf_reg_state *reg)  {  	/* -	 * Clear type, id, off, and union(map_ptr, range) and +	 * Clear type, off, and union(map_ptr, range) and  	 * padding between 'type' and union  	 */  	memset(reg, 0, offsetof(struct bpf_reg_state, var_off));  	reg->type = SCALAR_VALUE; +	reg->id = 0; +	reg->ref_obj_id = 0;  	reg->var_off = tnum_unknown;  	reg->frameno = 0;  	reg->precise = !env->bpf_capable; @@ -2185,6 +2354,12 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)  		return -EINVAL;  	} +	if (bpf_dev_bound_kfunc_id(func_id)) { +		err = bpf_dev_bound_kfunc_check(&env->log, prog_aux); +		if (err) +			return err; +	} +  	desc = &tab->descs[tab->nr_descs++];  	desc->func_id = func_id;  	desc->imm = call_imm; @@ -2386,6 +2561,32 @@ static int mark_reg_read(struct bpf_verifier_env *env,  	return 0;  } +static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ +	struct bpf_func_state *state = func(env, reg); +	int spi, ret; + +	/* For CONST_PTR_TO_DYNPTR, it must have already been done by +	 * check_reg_arg in check_helper_call and mark_btf_func_reg_size in +	 * check_kfunc_call. +	 */ +	if (reg->type == CONST_PTR_TO_DYNPTR) +		return 0; +	spi = dynptr_get_spi(env, reg); +	if (spi < 0) +		return spi; +	/* Caller ensures dynptr is valid and initialized, which means spi is in +	 * bounds and spi is the first dynptr slot. Simply mark stack slot as +	 * read. +	 */ +	ret = mark_reg_read(env, &state->stack[spi].spilled_ptr, +			    state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); +	if (ret) +		return ret; +	return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr, +			     state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); +} +  /* This function is supposed to be used by the following 32-bit optimization   * code only. It returns TRUE if the source or destination register operates   * on 64-bit, otherwise return FALSE. @@ -2748,6 +2949,12 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,  			 */  			if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))  				return -ENOTSUPP; +			/* kfunc with imm==0 is invalid and fixup_kfunc_call will +			 * catch this error later. Make backtracking conservative +			 * with ENOTSUPP. 
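The add_kfunc_call() hunk above gates device-bound kfuncs behind bpf_dev_bound_kfunc_check(), matching the syscall-side switch from offload_requested to dev_bound and the new BPF_F_XDP_DEV_BOUND_ONLY load flag seen earlier. A hedged userspace sketch of loading such a program through libbpf's low-level API; the opts fields and flag name are assumed to exist in a matching libbpf and UAPI header, and insns/insn_cnt plus "eth0" are placeholders:

#include <stddef.h>
#include <bpf/bpf.h>
#include <linux/bpf.h>
#include <net/if.h>

/* Sketch: load an XDP program as device-bound (but not offloaded) so the
 * verifier will accept calls to netdev-specific kfuncs for that ifindex.
 */
int load_dev_bound_xdp(const struct bpf_insn *insns, size_t insn_cnt)
{
	LIBBPF_OPTS(bpf_prog_load_opts, opts,
		.prog_ifindex = if_nametoindex("eth0"),
		.prog_flags   = BPF_F_XDP_DEV_BOUND_ONLY,
	);

	return bpf_prog_load(BPF_PROG_TYPE_XDP, "xdp_dev_bound", "GPL",
			     insns, insn_cnt, &opts);
}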
+			 */ +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && insn->imm == 0) +				return -ENOTSUPP;  			/* regular helper call sets R0 */  			*reg_mask &= ~1;  			if (*reg_mask & 0x3f) { @@ -3237,13 +3444,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,  	return reg->type != SCALAR_VALUE;  } +/* Copy src state preserving dst->parent and dst->live fields */ +static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) +{ +	struct bpf_reg_state *parent = dst->parent; +	enum bpf_reg_liveness live = dst->live; + +	*dst = *src; +	dst->parent = parent; +	dst->live = live; +} +  static void save_register_state(struct bpf_func_state *state,  				int spi, struct bpf_reg_state *reg,  				int size)  {  	int i; -	state->stack[spi].spilled_ptr = *reg; +	copy_register_state(&state->stack[spi].spilled_ptr, reg);  	if (size == BPF_REG_SIZE)  		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; @@ -3255,6 +3473,11 @@ static void save_register_state(struct bpf_func_state *state,  		scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]);  } +static bool is_bpf_st_mem(struct bpf_insn *insn) +{ +	return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM; +} +  /* check_stack_{read,write}_fixed_off functions track spill/fill of registers,   * stack boundary and alignment are checked in check_mem_access()   */ @@ -3266,8 +3489,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,  {  	struct bpf_func_state *cur; /* state of the current function */  	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; -	u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; +	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];  	struct bpf_reg_state *reg = NULL; +	u32 dst_reg = insn->dst_reg;  	err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE));  	if (err) @@ -3289,7 +3513,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,  		bool sanitize = reg && is_spillable_regtype(reg->type);  		for (i = 0; i < size; i++) { -			if (state->stack[spi].slot_type[i] == STACK_INVALID) { +			u8 type = state->stack[spi].slot_type[i]; + +			if (type != STACK_MISC && type != STACK_ZERO) {  				sanitize = true;  				break;  			} @@ -3299,6 +3525,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,  			env->insn_aux_data[insn_idx].sanitize_stack_spill = true;  	} +	err = destroy_if_dynptr_stack_slot(env, state, spi); +	if (err) +		return err; +  	mark_stack_slot_scratched(env, spi);  	if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&  	    !register_is_null(reg) && env->bpf_capable) { @@ -3314,6 +3544,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,  				return err;  		}  		save_register_state(state, spi, reg, size); +	} else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && +		   insn->imm != 0 && env->bpf_capable) { +		struct bpf_reg_state fake_reg = {}; + +		__mark_reg_known(&fake_reg, (u32)insn->imm); +		fake_reg.type = SCALAR_VALUE; +		save_register_state(state, spi, &fake_reg, size);  	} else if (reg && is_spillable_regtype(reg->type)) {  		/* register containing pointer is being spilled into stack */  		if (size != BPF_REG_SIZE) { @@ -3348,7 +3585,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,  			state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;  		/* when we zero initialize stack slots mark them as such */ -		if (reg && register_is_null(reg)) { +		if ((reg && register_is_null(reg)) || +		    (!reg && is_bpf_st_mem(insn) && 
insn->imm == 0)) {  			/* backtracking doesn't work for STACK_ZERO yet. */  			err = mark_chain_precision(env, value_regno);  			if (err) @@ -3393,6 +3631,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,  	int min_off, max_off;  	int i, err;  	struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; +	struct bpf_insn *insn = &env->prog->insnsi[insn_idx];  	bool writing_zero = false;  	/* set if the fact that we're writing a zero is used to let any  	 * stack slots remain STACK_ZERO @@ -3405,13 +3644,22 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,  	max_off = ptr_reg->smax_value + off + size;  	if (value_regno >= 0)  		value_reg = &cur->regs[value_regno]; -	if (value_reg && register_is_null(value_reg)) +	if ((value_reg && register_is_null(value_reg)) || +	    (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0))  		writing_zero = true;  	err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE));  	if (err)  		return err; +	for (i = min_off; i < max_off; i++) { +		int spi; + +		spi = __get_spi(i); +		err = destroy_if_dynptr_stack_slot(env, state, spi); +		if (err) +			return err; +	}  	/* Variable offset writes destroy any spilled pointers in range. */  	for (i = min_off; i < max_off; i++) { @@ -3569,7 +3817,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,  				 */  				s32 subreg_def = state->regs[dst_regno].subreg_def; -				state->regs[dst_regno] = *reg; +				copy_register_state(&state->regs[dst_regno], reg);  				state->regs[dst_regno].subreg_def = subreg_def;  			} else {  				for (i = 0; i < size; i++) { @@ -3590,7 +3838,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,  		if (dst_regno >= 0) {  			/* restore register state from stack */ -			state->regs[dst_regno] = *reg; +			copy_register_state(&state->regs[dst_regno], reg);  			/* mark reg as written since spilled pointer state likely  			 * has its liveness marks cleared by is_state_visited()  			 * which resets stack/reg liveness for state transitions @@ -4751,6 +4999,25 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)  	return 0;  } +#define BTF_TYPE_SAFE_NESTED(__type)  __PASTE(__type, __safe_fields) + +BTF_TYPE_SAFE_NESTED(struct task_struct) { +	const cpumask_t *cpus_ptr; +}; + +static bool nested_ptr_is_trusted(struct bpf_verifier_env *env, +				  struct bpf_reg_state *reg, +				  int off) +{ +	/* If its parent is not trusted, it can't regain its trusted status. */ +	if (!is_trusted_reg(reg)) +		return false; + +	BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct)); + +	return btf_nested_type_is_trusted(&env->log, reg, off); +} +  static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  				   struct bpf_reg_state *regs,  				   int regno, int off, int size, @@ -4822,7 +5089,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  			return -EACCES;  		} -		if (type_is_alloc(reg->type) && !reg->ref_obj_id) { +		if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && +		    !reg->ref_obj_id) {  			verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");  			return -EFAULT;  		} @@ -4839,10 +5107,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,  	if (type_flag(reg->type) & PTR_UNTRUSTED)  		flag |= PTR_UNTRUSTED; -	/* By default any pointer obtained from walking a trusted pointer is -	 * no longer trusted except the rcu case below. 
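The BTF_TYPE_SAFE_NESTED(struct task_struct) annotation and nested_ptr_is_trusted() above let a load through a trusted task_struct pointer keep PTR_TRUSTED for the listed cpus_ptr field instead of downgrading it. A sketch of what that permits in a program, assuming the bpf_cpumask_test_cpu() kfunc from kernel/bpf/cpumask.c and a selftest-style __ksym declaration:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Assumed to match the kfunc exported by kernel/bpf/cpumask.c. */
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(nested_trust_example, struct task_struct *task, u64 clone_flags)
{
	/* 'task' is a trusted BTF pointer here; with the safe-field
	 * annotation, task->cpus_ptr stays trusted and can be passed to a
	 * kfunc that requires a trusted argument instead of being rejected.
	 */
	if (bpf_cpumask_test_cpu(0, task->cpus_ptr))
		bpf_printk("new task may run on CPU0");
	return 0;
}

char _license[] SEC("license") = "GPL";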
+	/* By default any pointer obtained from walking a trusted pointer is no +	 * longer trusted, unless the field being accessed has explicitly been +	 * marked as inheriting its parent's state of trust. +	 * +	 * An RCU-protected pointer can also be deemed trusted if we are in an +	 * RCU read region. This case is handled below.  	 */ -	flag &= ~PTR_TRUSTED; +	if (nested_ptr_is_trusted(env, reg, off)) +		flag |= PTR_TRUSTED; +	else +		flag &= ~PTR_TRUSTED;  	if (flag & MEM_RCU) {  		/* Mark value register as MEM_RCU only if it is protected by @@ -5439,6 +5714,31 @@ static int check_stack_range_initialized(  	}  	if (meta && meta->raw_mode) { +		/* Ensure we won't be overwriting dynptrs when simulating byte +		 * by byte access in check_helper_call using meta.access_size. +		 * This would be a problem if we have a helper in the future +		 * which takes: +		 * +		 *	helper(uninit_mem, len, dynptr) +		 * +		 * Now, uninint_mem may overlap with dynptr pointer. Hence, it +		 * may end up writing to dynptr itself when touching memory from +		 * arg 1. This can be relaxed on a case by case basis for known +		 * safe cases, but reject due to the possibilitiy of aliasing by +		 * default. +		 */ +		for (i = min_off; i < max_off + access_size; i++) { +			int stack_off = -i - 1; + +			spi = __get_spi(i); +			/* raw_mode may write past allocated_stack */ +			if (state->allocated_stack <= stack_off) +				continue; +			if (state->stack[spi].slot_type[stack_off % BPF_REG_SIZE] == STACK_DYNPTR) { +				verbose(env, "potential write to dynptr at off=%d disallowed\n", i); +				return -EACCES; +			} +		}  		meta->access_size = access_size;  		meta->regno = regno;  		return 0; @@ -5780,9 +6080,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,  			cur->active_lock.ptr = btf;  		cur->active_lock.id = reg->id;  	} else { -		struct bpf_func_state *fstate = cur_func(env);  		void *ptr; -		int i;  		if (map)  			ptr = map; @@ -5798,25 +6096,11 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,  			verbose(env, "bpf_spin_unlock of different lock\n");  			return -EINVAL;  		} -		cur->active_lock.ptr = NULL; -		cur->active_lock.id = 0; -		for (i = fstate->acquired_refs - 1; i >= 0; i--) { -			int err; +		invalidate_non_owning_refs(env); -			/* Complain on error because this reference state cannot -			 * be freed before this point, as bpf_spin_lock critical -			 * section does not allow functions that release the -			 * allocated object immediately. -			 */ -			if (!fstate->refs[i].release_on_unlock) -				continue; -			err = release_reference(env, fstate->refs[i].id); -			if (err) { -				verbose(env, "failed to release release_on_unlock reference"); -				return err; -			} -		} +		cur->active_lock.ptr = NULL; +		cur->active_lock.id = 0;  	}  	return 0;  } @@ -5926,6 +6210,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  			enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)  {  	struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; +	int spi = 0;  	/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an  	 * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): @@ -5936,12 +6221,14 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  	}  	/* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to  	 * check_func_arg_reg_off's logic. We only need to check offset -	 * alignment for PTR_TO_STACK. +	 * and its alignment for PTR_TO_STACK.  	 
*/ -	if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { -		verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); -		return -EINVAL; +	if (reg->type == PTR_TO_STACK) { +		spi = dynptr_get_spi(env, reg); +		if (spi < 0 && spi != -ERANGE) +			return spi;  	} +  	/*  MEM_UNINIT - Points to memory that is an appropriate candidate for  	 *		 constructing a mutable bpf_dynptr object.  	 * @@ -5958,7 +6245,7 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  	 *		 to.  	 */  	if (arg_type & MEM_UNINIT) { -		if (!is_dynptr_reg_valid_uninit(env, reg)) { +		if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {  			verbose(env, "Dynptr has to be an uninitialized dynptr\n");  			return -EINVAL;  		} @@ -5973,13 +6260,15 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  		meta->uninit_dynptr_regno = regno;  	} else /* MEM_RDONLY and None case from above */ { +		int err; +  		/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */  		if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {  			verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");  			return -EINVAL;  		} -		if (!is_dynptr_reg_valid_init(env, reg)) { +		if (!is_dynptr_reg_valid_init(env, reg, spi)) {  			verbose(env,  				"Expected an initialized dynptr as arg #%d\n",  				regno); @@ -6006,6 +6295,10 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,  				err_extra, regno);  			return -EINVAL;  		} + +		err = mark_dynptr_read(env, reg); +		if (err) +			return err;  	}  	return 0;  } @@ -6275,6 +6568,23 @@ found:  	return 0;  } +static struct btf_field * +reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) +{ +	struct btf_field *field; +	struct btf_record *rec; + +	rec = reg_btf_record(reg); +	if (!rec) +		return NULL; + +	field = btf_record_find(rec, off, fields); +	if (!field) +		return NULL; + +	return field; +} +  int check_func_arg_reg_off(struct bpf_verifier_env *env,  			   const struct bpf_reg_state *reg, int regno,  			   enum bpf_arg_type arg_type) @@ -6296,6 +6606,18 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,  		 */  		if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)  			return 0; + +		if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) { +			if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT)) +				return __check_ptr_off_reg(env, reg, regno, true); + +			verbose(env, "R%d must have zero offset when passed to release func\n", +				regno); +			verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, +				kernel_type_name(reg->btf, reg->btf_id), reg->off); +			return -EINVAL; +		} +  		/* Doing check_ptr_off_reg check for the offset will catch this  		 * because fixed_off_ok is false, but checking here allows us  		 * to give the user a better error message. @@ -6330,6 +6652,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,  	case PTR_TO_BTF_ID | PTR_TRUSTED:  	case PTR_TO_BTF_ID | MEM_RCU:  	case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: +	case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:  		/* When referenced PTR_TO_BTF_ID is passed to release function,  		 * its fixed offset must be 0. In the other cases, fixed offset  		 * can be non-zero. This was already checked above. 
So pass @@ -6343,15 +6666,29 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,  	}  } -static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  {  	struct bpf_func_state *state = func(env, reg);  	int spi;  	if (reg->type == CONST_PTR_TO_DYNPTR) -		return reg->ref_obj_id; +		return reg->id; +	spi = dynptr_get_spi(env, reg); +	if (spi < 0) +		return spi; +	return state->stack[spi].spilled_ptr.id; +} -	spi = get_spi(reg->off); +static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ +	struct bpf_func_state *state = func(env, reg); +	int spi; + +	if (reg->type == CONST_PTR_TO_DYNPTR) +		return reg->ref_obj_id; +	spi = dynptr_get_spi(env, reg); +	if (spi < 0) +		return spi;  	return state->stack[spi].spilled_ptr.ref_obj_id;  } @@ -6425,9 +6762,8 @@ skip_type_check:  			 * PTR_TO_STACK.  			 */  			if (reg->type == PTR_TO_STACK) { -				spi = get_spi(reg->off); -				if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || -				    !state->stack[spi].spilled_ptr.ref_obj_id) { +				spi = dynptr_get_spi(env, reg); +				if (spi < 0 || !state->stack[spi].spilled_ptr.ref_obj_id) {  					verbose(env, "arg %d is an unacquired reference\n", regno);  					return -EINVAL;  				} @@ -6528,6 +6864,10 @@ skip_type_check:  		meta->ret_btf_id = reg->btf_id;  		break;  	case ARG_PTR_TO_SPIN_LOCK: +		if (in_rbtree_lock_required_cb(env)) { +			verbose(env, "can't spin_{lock,unlock} in rbtree cb\n"); +			return -EACCES; +		}  		if (meta->func_id == BPF_FUNC_spin_lock) {  			err = process_spin_lock(env, regno, true);  			if (err) @@ -7079,6 +7419,17 @@ static int release_reference(struct bpf_verifier_env *env,  	return 0;  } +static void invalidate_non_owning_refs(struct bpf_verifier_env *env) +{ +	struct bpf_func_state *unused; +	struct bpf_reg_state *reg; + +	bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ +		if (type_is_non_owning_ref(reg->type)) +			__mark_reg_unknown(env, reg); +	})); +} +  static void clear_caller_saved_regs(struct bpf_verifier_env *env,  				    struct bpf_reg_state *regs)  { @@ -7100,6 +7451,8 @@ static int set_callee_state(struct bpf_verifier_env *env,  			    struct bpf_func_state *caller,  			    struct bpf_func_state *callee, int insn_idx); +static bool is_callback_calling_kfunc(u32 btf_id); +  static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			     int *insn_idx, int subprog,  			     set_callee_state_fn set_callee_state_cb) @@ -7154,10 +7507,18 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn  	 * interested in validating only BPF helpers that can call subprogs as  	 * callbacks  	 */ -	if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { -		verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", -			func_id_name(insn->imm), insn->imm); -		return -EFAULT; +	if (set_callee_state_cb != set_callee_state) { +		if (bpf_pseudo_kfunc_call(insn) && +		    !is_callback_calling_kfunc(insn->imm)) { +			verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", +				func_id_name(insn->imm), insn->imm); +			return -EFAULT; +		} else if (!bpf_pseudo_kfunc_call(insn) && +			   !is_callback_calling_function(insn->imm)) { /* helper */ +			verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", +				func_id_name(insn->imm), insn->imm); +			return -EFAULT; +		}  	}  	if 
(insn->code == (BPF_JMP | BPF_CALL) && @@ -7409,7 +7770,7 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,  	 * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);  	 */  	__mark_reg_not_init(env, &callee->regs[BPF_REG_0]); -	mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); +	mark_dynptr_cb_reg(env, &callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);  	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];  	/* unused */ @@ -7422,6 +7783,63 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,  	return 0;  } +static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, +					 struct bpf_func_state *caller, +					 struct bpf_func_state *callee, +					 int insn_idx) +{ +	/* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, +	 *                     bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)); +	 * +	 * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset +	 * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd +	 * by this point, so look at 'root' +	 */ +	struct btf_field *field; + +	field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off, +				      BPF_RB_ROOT); +	if (!field || !field->graph_root.value_btf_id) +		return -EFAULT; + +	mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root); +	ref_set_non_owning(env, &callee->regs[BPF_REG_1]); +	mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root); +	ref_set_non_owning(env, &callee->regs[BPF_REG_2]); + +	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]); +	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]); +	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]); +	callee->in_callback_fn = true; +	callee->callback_ret_range = tnum_range(0, 1); +	return 0; +} + +static bool is_rbtree_lock_required_kfunc(u32 btf_id); + +/* Are we currently verifying the callback for a rbtree helper that must + * be called with lock held? If so, no need to complain about unreleased + * lock + */ +static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) +{ +	struct bpf_verifier_state *state = env->cur_state; +	struct bpf_insn *insn = env->prog->insnsi; +	struct bpf_func_state *callee; +	int kfunc_btf_id; + +	if (!state->curframe) +		return false; + +	callee = state->frame[state->curframe]; + +	if (!callee->in_callback_fn) +		return false; + +	kfunc_btf_id = insn[callee->callsite].imm; +	return is_rbtree_lock_required_kfunc(kfunc_btf_id); +} +  static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)  {  	struct bpf_verifier_state *state = env->cur_state; @@ -7614,6 +8032,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,  	struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3];  	struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5];  	struct bpf_map *fmt_map = fmt_reg->map_ptr; +	struct bpf_bprintf_data data = {};  	int err, fmt_map_off, num_args;  	u64 fmt_addr;  	char *fmt; @@ -7638,7 +8057,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,  	/* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we  	 * can focus on validating the format specifiers.  	 
*/ -	err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args); +	err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, num_args, &data);  	if (err < 0)  		verbose(env, "Invalid format string\n"); @@ -7914,13 +8333,32 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {  			if (arg_type_is_dynptr(fn->arg_type[i])) {  				struct bpf_reg_state *reg = ®s[BPF_REG_1 + i]; +				int id, ref_obj_id; + +				if (meta.dynptr_id) { +					verbose(env, "verifier internal error: meta.dynptr_id already set\n"); +					return -EFAULT; +				}  				if (meta.ref_obj_id) {  					verbose(env, "verifier internal error: meta.ref_obj_id already set\n");  					return -EFAULT;  				} -				meta.ref_obj_id = dynptr_ref_obj_id(env, reg); +				id = dynptr_id(env, reg); +				if (id < 0) { +					verbose(env, "verifier internal error: failed to obtain dynptr id\n"); +					return id; +				} + +				ref_obj_id = dynptr_ref_obj_id(env, reg); +				if (ref_obj_id < 0) { +					verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); +					return ref_obj_id; +				} + +				meta.dynptr_id = id; +				meta.ref_obj_id = ref_obj_id;  				break;  			}  		} @@ -8076,6 +8514,9 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn  		return -EFAULT;  	} +	if (is_dynptr_ref_function(func_id)) +		regs[BPF_REG_0].dynptr_id = meta.dynptr_id; +  	if (is_ptr_cast_function(func_id) || is_dynptr_ref_function(func_id)) {  		/* For release_reference() */  		regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; @@ -8167,6 +8608,7 @@ struct bpf_kfunc_call_arg_meta {  	bool r0_rdonly;  	u32 ret_btf_id;  	u64 r0_size; +	u32 subprogno;  	struct {  		u64 value;  		bool found; @@ -8178,6 +8620,9 @@ struct bpf_kfunc_call_arg_meta {  	struct {  		struct btf_field *field;  	} arg_list_head; +	struct { +		struct btf_field *field; +	} arg_rbtree_root;  };  static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) @@ -8289,12 +8734,16 @@ enum {  	KF_ARG_DYNPTR_ID,  	KF_ARG_LIST_HEAD_ID,  	KF_ARG_LIST_NODE_ID, +	KF_ARG_RB_ROOT_ID, +	KF_ARG_RB_NODE_ID,  };  BTF_ID_LIST(kf_arg_btf_ids)  BTF_ID(struct, bpf_dynptr_kern)  BTF_ID(struct, bpf_list_head)  BTF_ID(struct, bpf_list_node) +BTF_ID(struct, bpf_rb_root) +BTF_ID(struct, bpf_rb_node)  static bool __is_kfunc_ptr_arg_type(const struct btf *btf,  				    const struct btf_param *arg, int type) @@ -8328,6 +8777,28 @@ static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param  	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);  } +static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg) +{ +	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID); +} + +static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg) +{ +	return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); +} + +static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, +				  const struct btf_param *arg) +{ +	const struct btf_type *t; + +	t = btf_type_resolve_func_ptr(btf, arg->type, NULL); +	if (!t) +		return false; + +	return true; +} +  /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */  static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,  					const struct btf *btf, @@ -8387,6 +8858,9 @@ enum kfunc_ptr_arg_type {  	KF_ARG_PTR_TO_BTF_ID,	     /* Also covers reg2btf_ids conversions */  	KF_ARG_PTR_TO_MEM,  	KF_ARG_PTR_TO_MEM_SIZE,	     /* Size 
derived from next argument, skip it */ +	KF_ARG_PTR_TO_CALLBACK, +	KF_ARG_PTR_TO_RB_ROOT, +	KF_ARG_PTR_TO_RB_NODE,  };  enum special_kfunc_type { @@ -8400,6 +8874,9 @@ enum special_kfunc_type {  	KF_bpf_rdonly_cast,  	KF_bpf_rcu_read_lock,  	KF_bpf_rcu_read_unlock, +	KF_bpf_rbtree_remove, +	KF_bpf_rbtree_add, +	KF_bpf_rbtree_first,  };  BTF_SET_START(special_kfunc_set) @@ -8411,6 +8888,9 @@ BTF_ID(func, bpf_list_pop_front)  BTF_ID(func, bpf_list_pop_back)  BTF_ID(func, bpf_cast_to_kern_ctx)  BTF_ID(func, bpf_rdonly_cast) +BTF_ID(func, bpf_rbtree_remove) +BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_first)  BTF_SET_END(special_kfunc_set)  BTF_ID_LIST(special_kfunc_list) @@ -8424,6 +8904,9 @@ BTF_ID(func, bpf_cast_to_kern_ctx)  BTF_ID(func, bpf_rdonly_cast)  BTF_ID(func, bpf_rcu_read_lock)  BTF_ID(func, bpf_rcu_read_unlock) +BTF_ID(func, bpf_rbtree_remove) +BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_first)  static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)  { @@ -8485,6 +8968,12 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,  	if (is_kfunc_arg_list_node(meta->btf, &args[argno]))  		return KF_ARG_PTR_TO_LIST_NODE; +	if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) +		return KF_ARG_PTR_TO_RB_ROOT; + +	if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) +		return KF_ARG_PTR_TO_RB_NODE; +  	if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {  		if (!btf_type_is_struct(ref_t)) {  			verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -8494,6 +8983,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,  		return KF_ARG_PTR_TO_BTF_ID;  	} +	if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) +		return KF_ARG_PTR_TO_CALLBACK; +  	if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1]))  		arg_mem_size = true; @@ -8532,9 +9024,37 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,  		reg_ref_id = *reg2btf_ids[base_type(reg->type)];  	} -	if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id)) +	/* Enforce strict type matching for calls to kfuncs that are acquiring +	 * or releasing a reference, or are no-cast aliases. We do _not_ +	 * enforce strict matching for plain KF_TRUSTED_ARGS kfuncs by default, +	 * as we want to enable BPF programs to pass types that are bitwise +	 * equivalent without forcing them to explicitly cast with something +	 * like bpf_cast_to_kern_ctx(). +	 * +	 * For example, say we had a type like the following: +	 * +	 * struct bpf_cpumask { +	 *	cpumask_t cpumask; +	 *	refcount_t usage; +	 * }; +	 * +	 * Note that as specified in <linux/cpumask.h>, cpumask_t is typedef'ed +	 * to a struct cpumask, so it would be safe to pass a struct +	 * bpf_cpumask * to a kfunc expecting a struct cpumask *. +	 * +	 * The philosophy here is similar to how we allow scalars of different +	 * types to be passed to kfuncs as long as the size is the same. The +	 * only difference here is that we're simply allowing +	 * btf_struct_ids_match() to walk the struct at the 0th offset, and +	 * resolve types. 
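The strict-type-matching comment above names struct bpf_cpumask as the motivating case: btf_type_ids_nocast_alias() lets the verifier treat it as interchangeable with cpumask_t at offset 0. A sketch of that usage, assuming kfunc declarations matching kernel/bpf/cpumask.c; the C-level cast only quiets the compiler, the BTF-level match is what these hunks relax:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Assumed to match kernel/bpf/cpumask.c. */
struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;

SEC("tp_btf/task_newtask")
int BPF_PROG(cpumask_alias_example, struct task_struct *task, u64 clone_flags)
{
	struct bpf_cpumask *mask = bpf_cpumask_create();	/* acquired */
	bool set;

	if (!mask)
		return 0;
	bpf_cpumask_set_cpu(1, mask);
	/* The cast only satisfies the C compiler; at the BTF level the
	 * verifier now accepts bpf_cpumask where cpumask_t is expected
	 * because the two are no-cast aliases.
	 */
	set = bpf_cpumask_test_cpu(1, (const struct cpumask *)mask);
	bpf_cpumask_release(mask);				/* released */
	return set;
}

char _license[] SEC("license") = "GPL";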
+	 */ +	if (is_kfunc_acquire(meta) || +	    (is_kfunc_release(meta) && reg->ref_obj_id) || +	    btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))  		strict_type_match = true; +	WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off); +  	reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id);  	reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);  	if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { @@ -8580,38 +9100,54 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,  	return 0;  } -static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id) +static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)  { -	struct bpf_func_state *state = cur_func(env); +	struct bpf_verifier_state *state = env->cur_state; + +	if (!state->active_lock.ptr) { +		verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); +		return -EFAULT; +	} + +	if (type_flag(reg->type) & NON_OWN_REF) { +		verbose(env, "verifier internal error: NON_OWN_REF already set\n"); +		return -EFAULT; +	} + +	reg->type |= NON_OWN_REF; +	return 0; +} + +static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) +{ +	struct bpf_func_state *state, *unused;  	struct bpf_reg_state *reg;  	int i; -	/* bpf_spin_lock only allows calling list_push and list_pop, no BPF -	 * subprogs, no global functions. This means that the references would -	 * not be released inside the critical section but they may be added to -	 * the reference state, and the acquired_refs are never copied out for a -	 * different frame as BPF to BPF calls don't work in bpf_spin_lock -	 * critical sections. -	 */ +	state = cur_func(env); +  	if (!ref_obj_id) { -		verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n"); +		verbose(env, "verifier internal error: ref_obj_id is zero for " +			     "owning -> non-owning conversion\n");  		return -EFAULT;  	} +  	for (i = 0; i < state->acquired_refs; i++) { -		if (state->refs[i].id == ref_obj_id) { -			if (state->refs[i].release_on_unlock) { -				verbose(env, "verifier internal error: expected false release_on_unlock"); -				return -EFAULT; +		if (state->refs[i].id != ref_obj_id) +			continue; + +		/* Clear ref_obj_id here so release_reference doesn't clobber +		 * the whole reg +		 */ +		bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ +			if (reg->ref_obj_id == ref_obj_id) { +				reg->ref_obj_id = 0; +				ref_set_non_owning(env, reg);  			} -			state->refs[i].release_on_unlock = true; -			/* Now mark everyone sharing same ref_obj_id as untrusted */ -			bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ -				if (reg->ref_obj_id == ref_obj_id) -					reg->type |= PTR_UNTRUSTED; -			})); -			return 0; -		} +		})); +		return 0;  	} +  	verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");  	return -EFAULT;  } @@ -8697,101 +9233,226 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)  	       btf_id == special_kfunc_list[KF_bpf_list_pop_back];  } -static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, -					   struct bpf_reg_state *reg, u32 regno, -					   struct bpf_kfunc_call_arg_meta *meta) +static bool is_bpf_rbtree_api_kfunc(u32 btf_id) +{ +	return btf_id == special_kfunc_list[KF_bpf_rbtree_add] || +	       btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || +	       btf_id == special_kfunc_list[KF_bpf_rbtree_first]; +} + 
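The helpers added above fold bpf_rb_root/bpf_rb_node handling into the same graph infrastructure as bpf_list_head and record which kfuncs are callback-calling and lock-required. A selftest-style sketch of the programming model they verify; bpf_experimental.h is assumed to supply bpf_obj_new()/bpf_obj_drop(), the rbtree kfunc declarations and the __contains() annotation:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"	/* assumed: bpf_obj_new/drop, rbtree kfuncs, __contains */

#ifndef container_of
#define container_of(ptr, type, member) \
	((type *)((void *)(ptr) - __builtin_offsetof(type, member)))
#endif

struct node_data {
	long key;
	struct bpf_rb_node node;
};

#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
private(A) struct bpf_spin_lock glock;
private(A) struct bpf_rb_root groot __contains(node_data, node);

/* bpf_rbtree_add() callback: runs with glock held; it may not take or
 * release the lock and may not call bpf_rbtree_remove() itself.
 */
static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
{
	struct node_data *na = container_of(a, struct node_data, node);
	struct node_data *nb = container_of(b, struct node_data, node);

	return na->key < nb->key;
}

SEC("tc")
int rbtree_example(void *ctx)
{
	struct node_data *n;
	struct bpf_rb_node *res;

	n = bpf_obj_new(typeof(*n));		/* owning reference */
	if (!n)
		return 0;
	n->key = 42;

	bpf_spin_lock(&glock);
	/* After the add, 'n' is only a non-owning reference: usable while
	 * the lock is held, invalidated by bpf_spin_unlock().
	 */
	bpf_rbtree_add(&groot, &n->node, less);

	res = bpf_rbtree_first(&groot);			/* non-owning or NULL */
	if (res)
		res = bpf_rbtree_remove(&groot, res);	/* owning again */
	bpf_spin_unlock(&glock);

	if (res)
		bpf_obj_drop(container_of(res, struct node_data, node));
	return 0;
}

char _license[] SEC("license") = "GPL";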
+static bool is_bpf_graph_api_kfunc(u32 btf_id) +{ +	return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id); +} + +static bool is_callback_calling_kfunc(u32 btf_id) +{ +	return btf_id == special_kfunc_list[KF_bpf_rbtree_add]; +} + +static bool is_rbtree_lock_required_kfunc(u32 btf_id) +{ +	return is_bpf_rbtree_api_kfunc(btf_id); +} + +static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env, +					  enum btf_field_type head_field_type, +					  u32 kfunc_btf_id) +{ +	bool ret; + +	switch (head_field_type) { +	case BPF_LIST_HEAD: +		ret = is_bpf_list_api_kfunc(kfunc_btf_id); +		break; +	case BPF_RB_ROOT: +		ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id); +		break; +	default: +		verbose(env, "verifier internal error: unexpected graph root argument type %s\n", +			btf_field_type_name(head_field_type)); +		return false; +	} + +	if (!ret) +		verbose(env, "verifier internal error: %s head arg for unknown kfunc\n", +			btf_field_type_name(head_field_type)); +	return ret; +} + +static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, +					  enum btf_field_type node_field_type, +					  u32 kfunc_btf_id)  { +	bool ret; + +	switch (node_field_type) { +	case BPF_LIST_NODE: +		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] || +		       kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]); +		break; +	case BPF_RB_NODE: +		ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || +		       kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]); +		break; +	default: +		verbose(env, "verifier internal error: unexpected graph node argument type %s\n", +			btf_field_type_name(node_field_type)); +		return false; +	} + +	if (!ret) +		verbose(env, "verifier internal error: %s node arg for unknown kfunc\n", +			btf_field_type_name(node_field_type)); +	return ret; +} + +static int +__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, +				   struct bpf_reg_state *reg, u32 regno, +				   struct bpf_kfunc_call_arg_meta *meta, +				   enum btf_field_type head_field_type, +				   struct btf_field **head_field) +{ +	const char *head_type_name;  	struct btf_field *field;  	struct btf_record *rec; -	u32 list_head_off; +	u32 head_off; -	if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) { -		verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n"); +	if (meta->btf != btf_vmlinux) { +		verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");  		return -EFAULT;  	} +	if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id)) +		return -EFAULT; + +	head_type_name = btf_field_type_name(head_field_type);  	if (!tnum_is_const(reg->var_off)) {  		verbose(env, -			"R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n", -			regno); +			"R%d doesn't have constant offset. 
%s has to be at the constant offset\n", +			regno, head_type_name);  		return -EINVAL;  	}  	rec = reg_btf_record(reg); -	list_head_off = reg->off + reg->var_off.value; -	field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD); +	head_off = reg->off + reg->var_off.value; +	field = btf_record_find(rec, head_off, head_field_type);  	if (!field) { -		verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off); +		verbose(env, "%s not found at offset=%u\n", head_type_name, head_off);  		return -EINVAL;  	}  	/* All functions require bpf_list_head to be protected using a bpf_spin_lock */  	if (check_reg_allocation_locked(env, reg)) { -		verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n", -			rec->spin_lock_off); +		verbose(env, "bpf_spin_lock at off=%d must be held for %s\n", +			rec->spin_lock_off, head_type_name);  		return -EINVAL;  	} -	if (meta->arg_list_head.field) { -		verbose(env, "verifier internal error: repeating bpf_list_head arg\n"); +	if (*head_field) { +		verbose(env, "verifier internal error: repeating %s arg\n", head_type_name);  		return -EFAULT;  	} -	meta->arg_list_head.field = field; +	*head_field = field;  	return 0;  } -static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, +static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,  					   struct bpf_reg_state *reg, u32 regno,  					   struct bpf_kfunc_call_arg_meta *meta)  { +	return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, +							  &meta->arg_list_head.field); +} + +static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, +					     struct bpf_reg_state *reg, u32 regno, +					     struct bpf_kfunc_call_arg_meta *meta) +{ +	return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, +							  &meta->arg_rbtree_root.field); +} + +static int +__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, +				   struct bpf_reg_state *reg, u32 regno, +				   struct bpf_kfunc_call_arg_meta *meta, +				   enum btf_field_type head_field_type, +				   enum btf_field_type node_field_type, +				   struct btf_field **node_field) +{ +	const char *node_type_name;  	const struct btf_type *et, *t;  	struct btf_field *field; -	struct btf_record *rec; -	u32 list_node_off; +	u32 node_off; -	if (meta->btf != btf_vmlinux || -	    (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] && -	     meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) { -		verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n"); +	if (meta->btf != btf_vmlinux) { +		verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n");  		return -EFAULT;  	} +	if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id)) +		return -EFAULT; + +	node_type_name = btf_field_type_name(node_field_type);  	if (!tnum_is_const(reg->var_off)) {  		verbose(env, -			"R%d doesn't have constant offset. bpf_list_node has to be at the constant offset\n", -			regno); +			"R%d doesn't have constant offset. 
%s has to be at the constant offset\n", +			regno, node_type_name);  		return -EINVAL;  	} -	rec = reg_btf_record(reg); -	list_node_off = reg->off + reg->var_off.value; -	field = btf_record_find(rec, list_node_off, BPF_LIST_NODE); -	if (!field || field->offset != list_node_off) { -		verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off); +	node_off = reg->off + reg->var_off.value; +	field = reg_find_field_offset(reg, node_off, node_field_type); +	if (!field || field->offset != node_off) { +		verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);  		return -EINVAL;  	} -	field = meta->arg_list_head.field; +	field = *node_field; -	et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id); +	et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id);  	t = btf_type_by_id(reg->btf, reg->btf_id); -	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf, -				  field->list_head.value_btf_id, true)) { -		verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d " +	if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf, +				  field->graph_root.value_btf_id, true)) { +		verbose(env, "operation on %s expects arg#1 %s at offset=%d "  			"in struct %s, but arg is at offset=%d in struct %s\n", -			field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off), -			list_node_off, btf_name_by_offset(reg->btf, t->name_off)); +			btf_field_type_name(head_field_type), +			btf_field_type_name(node_field_type), +			field->graph_root.node_offset, +			btf_name_by_offset(field->graph_root.btf, et->name_off), +			node_off, btf_name_by_offset(reg->btf, t->name_off));  		return -EINVAL;  	} -	if (list_node_off != field->list_head.node_offset) { -		verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n", -			list_node_off, field->list_head.node_offset, -			btf_name_by_offset(field->list_head.btf, et->name_off)); +	if (node_off != field->graph_root.node_offset) { +		verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n", +			node_off, btf_field_type_name(node_field_type), +			field->graph_root.node_offset, +			btf_name_by_offset(field->graph_root.btf, et->name_off));  		return -EINVAL;  	} -	/* Set arg#1 for expiration after unlock */ -	return ref_set_release_on_unlock(env, reg->ref_obj_id); + +	return 0; +} + +static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, +					   struct bpf_reg_state *reg, u32 regno, +					   struct bpf_kfunc_call_arg_meta *meta) +{ +	return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, +						  BPF_LIST_HEAD, BPF_LIST_NODE, +						  &meta->arg_list_head.field); +} + +static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, +					     struct bpf_reg_state *reg, u32 regno, +					     struct bpf_kfunc_call_arg_meta *meta) +{ +	return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, +						  BPF_RB_ROOT, BPF_RB_NODE, +						  &meta->arg_rbtree_root.field);  }  static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) @@ -8877,6 +9538,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  			return -EINVAL;  		} +		if (is_kfunc_trusted_args(meta) && +		    (register_is_null(reg) || type_may_be_null(reg->type))) { +			verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); +			return -EACCES; +		} +  		if (reg->ref_obj_id) {  			if 
(is_kfunc_release(meta) && meta->ref_obj_id) {  				verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -8922,8 +9589,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  		case KF_ARG_PTR_TO_DYNPTR:  		case KF_ARG_PTR_TO_LIST_HEAD:  		case KF_ARG_PTR_TO_LIST_NODE: +		case KF_ARG_PTR_TO_RB_ROOT: +		case KF_ARG_PTR_TO_RB_NODE:  		case KF_ARG_PTR_TO_MEM:  		case KF_ARG_PTR_TO_MEM_SIZE: +		case KF_ARG_PTR_TO_CALLBACK:  			/* Trusted by default */  			break;  		default: @@ -9000,6 +9670,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  			if (ret < 0)  				return ret;  			break; +		case KF_ARG_PTR_TO_RB_ROOT: +			if (reg->type != PTR_TO_MAP_VALUE && +			    reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { +				verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); +				return -EINVAL; +			} +			if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { +				verbose(env, "allocated object must be referenced\n"); +				return -EINVAL; +			} +			ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); +			if (ret < 0) +				return ret; +			break;  		case KF_ARG_PTR_TO_LIST_NODE:  			if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {  				verbose(env, "arg#%d expected pointer to allocated object\n", i); @@ -9013,6 +9697,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  			if (ret < 0)  				return ret;  			break; +		case KF_ARG_PTR_TO_RB_NODE: +			if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { +				if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { +					verbose(env, "rbtree_remove node input must be non-owning ref\n"); +					return -EINVAL; +				} +				if (in_rbtree_lock_required_cb(env)) { +					verbose(env, "rbtree_remove not allowed in rbtree cb\n"); +					return -EINVAL; +				} +			} else { +				if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { +					verbose(env, "arg#%d expected pointer to allocated object\n", i); +					return -EINVAL; +				} +				if (!reg->ref_obj_id) { +					verbose(env, "allocated object must be referenced\n"); +					return -EINVAL; +				} +			} + +			ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); +			if (ret < 0) +				return ret; +			break;  		case KF_ARG_PTR_TO_BTF_ID:  			/* Only base_type is checked, further checks are done here */  			if ((base_type(reg->type) != PTR_TO_BTF_ID || @@ -9048,6 +9757,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_  			/* Skip next '__sz' argument */  			i++;  			break; +		case KF_ARG_PTR_TO_CALLBACK: +			meta->subprogno = reg->subprogno; +			break;  		}  	} @@ -9064,11 +9776,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			    int *insn_idx_p)  {  	const struct btf_type *t, *func, *func_proto, *ptr_type; +	u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;  	struct bpf_reg_state *regs = cur_regs(env);  	const char *func_name, *ptr_type_name;  	bool sleepable, rcu_lock, rcu_unlock;  	struct bpf_kfunc_call_arg_meta meta; -	u32 i, nargs, func_id, ptr_type_id;  	int err, insn_idx = *insn_idx_p;  	const struct btf_param *args;  	const struct btf_type *ret_t; @@ -9163,6 +9875,35 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  		}  	} +	if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] || +	    meta.func_id == special_kfunc_list[KF_bpf_list_push_back] || +	    meta.func_id == 
special_kfunc_list[KF_bpf_rbtree_add]) { +		release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; +		err = ref_convert_owning_non_owning(env, release_ref_obj_id); +		if (err) { +			verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", +				func_name, func_id); +			return err; +		} + +		err = release_reference(env, release_ref_obj_id); +		if (err) { +			verbose(env, "kfunc %s#%d reference has not been acquired before\n", +				func_name, func_id); +			return err; +		} +	} + +	if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { +		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, +					set_rbtree_add_callback_state); +		if (err) { +			verbose(env, "kfunc %s#%d failed callback verification\n", +				func_name, func_id); +			return err; +		} +	} +  	for (i = 0; i < CALLER_SAVED_REGS; i++)  		mark_reg_not_init(env, regs, caller_saved[i]); @@ -9227,11 +9968,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  				   meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {  				struct btf_field *field = meta.arg_list_head.field; -				mark_reg_known_zero(env, regs, BPF_REG_0); -				regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; -				regs[BPF_REG_0].btf = field->list_head.btf; -				regs[BPF_REG_0].btf_id = field->list_head.value_btf_id; -				regs[BPF_REG_0].off = field->list_head.node_offset; +				mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); +			} else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || +				   meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { +				struct btf_field *field = meta.arg_rbtree_root.field; + +				mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);  			} else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {  				mark_reg_known_zero(env, regs, BPF_REG_0);  				regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; @@ -9297,7 +10039,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			if (is_kfunc_ret_null(&meta))  				regs[BPF_REG_0].id = id;  			regs[BPF_REG_0].ref_obj_id = id; +		} else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { +			ref_set_non_owning(env, &regs[BPF_REG_0]);  		} + +		if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove]) +			invalidate_non_owning_refs(env); +  		if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)  			regs[BPF_REG_0].id = ++env->id_gen;  	} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ @@ -9584,7 +10332,7 @@ do_sim:  	 */  	if (!ptr_is_dst_reg) {  		tmp = *dst_reg; -		*dst_reg = *ptr_reg; +		copy_register_state(dst_reg, ptr_reg);  	}  	ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,  					env->insn_idx); @@ -10837,7 +11585,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  					 * to propagate min/max range.  					 
*/  					src_reg->id = ++env->id_gen; -				*dst_reg = *src_reg; +				copy_register_state(dst_reg, src_reg);  				dst_reg->live |= REG_LIVE_WRITTEN;  				dst_reg->subreg_def = DEF_NOT_SUBREG;  			} else { @@ -10848,7 +11596,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)  						insn->src_reg);  					return -EACCES;  				} else if (src_reg->type == SCALAR_VALUE) { -					*dst_reg = *src_reg; +					copy_register_state(dst_reg, src_reg);  					/* Make sure ID is cleared otherwise  					 * dst_reg min/max could be incorrectly  					 * propagated into src_reg by find_equal_scalars() @@ -11483,8 +12231,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,  		 */  		if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))  			return; -		if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off)) +		if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && +		    WARN_ON_ONCE(reg->off))  			return; +  		if (is_null) {  			reg->type = SCALAR_VALUE;  			/* We don't need id and ref_obj_id from this point @@ -11647,7 +12397,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,  	bpf_for_each_reg_in_vstate(vstate, state, reg, ({  		if (reg->type == SCALAR_VALUE && reg->id == known_reg->id) -			*reg = *known_reg; +			copy_register_state(reg, known_reg);  	}));  } @@ -12950,6 +13700,13 @@ static bool check_ids(u32 old_id, u32 cur_id, struct bpf_id_pair *idmap)  {  	unsigned int i; +	/* either both IDs should be set or both should be zero */ +	if (!!old_id != !!cur_id) +		return false; + +	if (old_id == 0) /* cur_id == 0 as well */ +		return true; +  	for (i = 0; i < BPF_ID_MAP_SIZE; i++) {  		if (!idmap[i].old) {  			/* Reached an empty slot; haven't seen this id before */ @@ -13061,79 +13818,74 @@ next:  	}  } +static bool regs_exact(const struct bpf_reg_state *rold, +		       const struct bpf_reg_state *rcur, +		       struct bpf_id_pair *idmap) +{ +	return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&  +	       check_ids(rold->id, rcur->id, idmap) && +	       check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap); +} +  /* Returns true if (rold safe implies rcur safe) */  static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,  		    struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)  { -	bool equal; -  	if (!(rold->live & REG_LIVE_READ))  		/* explored state didn't use this */  		return true; - -	equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; -  	if (rold->type == NOT_INIT)  		/* explored state can't have used this */  		return true;  	if (rcur->type == NOT_INIT)  		return false; + +	/* Enforce that register types have to match exactly, including their +	 * modifiers (like PTR_MAYBE_NULL, MEM_RDONLY, etc), as a general +	 * rule. +	 * +	 * One can make a point that using a pointer register as unbounded +	 * SCALAR would be technically acceptable, but this could lead to +	 * pointer leaks because scalars are allowed to leak while pointers +	 * are not. We could make this safe in special cases if root is +	 * calling us, but it's probably not worth the hassle. +	 * +	 * Also, register types that are *not* MAYBE_NULL could technically be +	 * safe to use as their MAYBE_NULL variants (e.g., PTR_TO_MAP_VALUE +	 * is safe to be used as PTR_TO_MAP_VALUE_OR_NULL, provided both point +	 * to the same map). 
+	 * However, if the old MAYBE_NULL register then got NULL checked, +	 * doing so could have affected others with the same id, and we can't +	 * check for that because we lost the id when we converted to +	 * a non-MAYBE_NULL variant. +	 * So, as a general rule we don't allow mixing MAYBE_NULL and +	 * non-MAYBE_NULL registers as well. +	 */ +	if (rold->type != rcur->type) +		return false; +  	switch (base_type(rold->type)) {  	case SCALAR_VALUE: -		if (equal) +		if (regs_exact(rold, rcur, idmap))  			return true;  		if (env->explore_alu_limits)  			return false; -		if (rcur->type == SCALAR_VALUE) { -			if (!rold->precise) -				return true; -			/* new val must satisfy old val knowledge */ -			return range_within(rold, rcur) && -			       tnum_in(rold->var_off, rcur->var_off); -		} else { -			/* We're trying to use a pointer in place of a scalar. -			 * Even if the scalar was unbounded, this could lead to -			 * pointer leaks because scalars are allowed to leak -			 * while pointers are not. We could make this safe in -			 * special cases if root is calling us, but it's -			 * probably not worth the hassle. -			 */ -			return false; -		} +		if (!rold->precise) +			return true; +		/* new val must satisfy old val knowledge */ +		return range_within(rold, rcur) && +		       tnum_in(rold->var_off, rcur->var_off);  	case PTR_TO_MAP_KEY:  	case PTR_TO_MAP_VALUE: -		/* a PTR_TO_MAP_VALUE could be safe to use as a -		 * PTR_TO_MAP_VALUE_OR_NULL into the same map. -		 * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- -		 * checked, doing so could have affected others with the same -		 * id, and we can't check for that because we lost the id when -		 * we converted to a PTR_TO_MAP_VALUE. -		 */ -		if (type_may_be_null(rold->type)) { -			if (!type_may_be_null(rcur->type)) -				return false; -			if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) -				return false; -			/* Check our ids match any regs they're supposed to */ -			return check_ids(rold->id, rcur->id, idmap); -		} -  		/* If the new min/max/var_off satisfy the old ones and  		 * everything else matches, we are OK. -		 * 'id' is not compared, since it's only used for maps with -		 * bpf_spin_lock inside map element and in such cases if -		 * the rest of the prog is valid for one map element then -		 * it's valid for all map elements regardless of the key -		 * used in bpf_map_lookup()  		 */ -		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && +		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&  		       range_within(rold, rcur) &&  		       tnum_in(rold->var_off, rcur->var_off) &&  		       check_ids(rold->id, rcur->id, idmap);  	case PTR_TO_PACKET_META:  	case PTR_TO_PACKET: -		if (rcur->type != rold->type) -			return false;  		/* We must have at least as much range as the old ptr  		 * did, so that any accesses which were safe before are  		 * still safe.  
This is true even if old range < old off, @@ -13148,7 +13900,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,  		if (rold->off != rcur->off)  			return false;  		/* id relations must be preserved */ -		if (rold->id && !check_ids(rold->id, rcur->id, idmap)) +		if (!check_ids(rold->id, rcur->id, idmap))  			return false;  		/* new val must satisfy old val knowledge */  		return range_within(rold, rcur) && @@ -13157,15 +13909,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,  		/* two stack pointers are equal only if they're pointing to  		 * the same stack frame, since fp-8 in foo != fp-8 in bar  		 */ -		return equal && rold->frameno == rcur->frameno; +		return regs_exact(rold, rcur, idmap) && rold->frameno == rcur->frameno;  	default: -		/* Only valid matches are exact, which memcmp() */ -		return equal; +		return regs_exact(rold, rcur, idmap);  	} - -	/* Shouldn't get here; if we do, say it's not safe */ -	WARN_ON_ONCE(1); -	return false;  }  static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, @@ -13212,10 +13959,9 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,  			return false;  		if (i % BPF_REG_SIZE != BPF_REG_SIZE - 1)  			continue; -		if (!is_spilled_reg(&old->stack[spi])) -			continue; -		if (!regsafe(env, &old->stack[spi].spilled_ptr, -			     &cur->stack[spi].spilled_ptr, idmap)) +		/* Both old and cur are having same slot_type */ +		switch (old->stack[spi].slot_type[BPF_REG_SIZE - 1]) { +		case STACK_SPILL:  			/* when explored and current stack slot are both storing  			 * spilled registers, check that stored pointers types  			 * are the same as well. @@ -13226,17 +13972,48 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,  			 * such verifier states are not equivalent.  			 
* return false to continue verification of this path  			 */ +			if (!regsafe(env, &old->stack[spi].spilled_ptr, +				     &cur->stack[spi].spilled_ptr, idmap)) +				return false; +			break; +		case STACK_DYNPTR: +		{ +			const struct bpf_reg_state *old_reg, *cur_reg; + +			old_reg = &old->stack[spi].spilled_ptr; +			cur_reg = &cur->stack[spi].spilled_ptr; +			if (old_reg->dynptr.type != cur_reg->dynptr.type || +			    old_reg->dynptr.first_slot != cur_reg->dynptr.first_slot || +			    !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) +				return false; +			break; +		} +		case STACK_MISC: +		case STACK_ZERO: +		case STACK_INVALID: +			continue; +		/* Ensure that new unhandled slot types return false by default */ +		default:  			return false; +		}  	}  	return true;  } -static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) +static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur, +		    struct bpf_id_pair *idmap)  { +	int i; +  	if (old->acquired_refs != cur->acquired_refs)  		return false; -	return !memcmp(old->refs, cur->refs, -		       sizeof(*old->refs) * old->acquired_refs); + +	for (i = 0; i < old->acquired_refs; i++) { +		if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap)) +			return false; +	} + +	return true;  }  /* compare two verifier states @@ -13278,7 +14055,7 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat  	if (!stacksafe(env, old, cur, env->idmap_scratch))  		return false; -	if (!refsafe(old, cur)) +	if (!refsafe(old, cur, env->idmap_scratch))  		return false;  	return true; @@ -13815,7 +14592,7 @@ static int do_check(struct bpf_verifier_env *env)  			env->prev_log_len = env->log.len_used;  		} -		if (bpf_prog_is_dev_bound(env->prog->aux)) { +		if (bpf_prog_is_offloaded(env->prog->aux)) {  			err = bpf_prog_offload_verify_insn(env, env->insn_idx,  							   env->prev_insn_idx);  			if (err) @@ -13966,7 +14743,7 @@ static int do_check(struct bpf_verifier_env *env)  					if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||  					    (insn->src_reg == BPF_PSEUDO_CALL) ||  					    (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && -					     (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) { +					     (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {  						verbose(env, "function calls are not allowed while holding a lock\n");  						return -EINVAL;  					} @@ -14002,7 +14779,8 @@ static int do_check(struct bpf_verifier_env *env)  					return -EINVAL;  				} -				if (env->cur_state->active_lock.ptr) { +				if (env->cur_state->active_lock.ptr && +				    !in_rbtree_lock_required_cb(env)) {  					verbose(env, "bpf_spin_unlock is missing\n");  					return -EINVAL;  				} @@ -14264,9 +15042,10 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,  {  	enum bpf_prog_type prog_type = resolve_prog_type(prog); -	if (btf_record_has_field(map->record, BPF_LIST_HEAD)) { +	if (btf_record_has_field(map->record, BPF_LIST_HEAD) || +	    btf_record_has_field(map->record, BPF_RB_ROOT)) {  		if (is_tracing_prog_type(prog_type)) { -			verbose(env, "tracing progs cannot use bpf_list_head yet\n"); +			verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n");  			return -EINVAL;  		}  	} @@ -14295,7 +15074,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,  		}  	} -	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && +	if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&  	    
!bpf_offload_prog_map_match(prog, map)) {  		verbose(env, "offload device mismatch between prog and map\n");  		return -EINVAL; @@ -14776,7 +15555,7 @@ static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt)  	unsigned int orig_prog_len = env->prog->len;  	int err; -	if (bpf_prog_is_dev_bound(env->prog->aux)) +	if (bpf_prog_is_offloaded(env->prog->aux))  		bpf_prog_offload_remove_insns(env, off, cnt);  	err = bpf_remove_insns(env->prog, off, cnt); @@ -14857,7 +15636,7 @@ static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env)  		else  			continue; -		if (bpf_prog_is_dev_bound(env->prog->aux)) +		if (bpf_prog_is_offloaded(env->prog->aux))  			bpf_prog_offload_replace_insn(env, i, &ja);  		memcpy(insn, &ja, sizeof(ja)); @@ -15044,7 +15823,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)  		}  	} -	if (bpf_prog_is_dev_bound(env->prog->aux)) +	if (bpf_prog_is_offloaded(env->prog->aux))  		return 0;  	insn = env->prog->insnsi + delta; @@ -15444,7 +16223,7 @@ static int fixup_call_args(struct bpf_verifier_env *env)  	int err = 0;  	if (env->prog->jit_requested && -	    !bpf_prog_is_dev_bound(env->prog->aux)) { +	    !bpf_prog_is_offloaded(env->prog->aux)) {  		err = jit_subprogs(env);  		if (err == 0)  			return 0; @@ -15488,12 +16267,25 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)  {  	const struct bpf_kfunc_desc *desc; +	void *xdp_kfunc;  	if (!insn->imm) {  		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");  		return -EINVAL;  	} +	*cnt = 0; + +	if (bpf_dev_bound_kfunc_id(insn->imm)) { +		xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm); +		if (xdp_kfunc) { +			insn->imm = BPF_CALL_IMM(xdp_kfunc); +			return 0; +		} + +		/* fallback to default kfunc when not supported by netdev */ +	} +  	/* insn->imm has the btf func_id. Replace it with  	 * an address (relative to __bpf_call_base).  	 
*/ @@ -15504,7 +16296,6 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,  		return -EFAULT;  	} -	*cnt = 0;  	insn->imm = desc->imm;  	if (insn->off)  		return 0; @@ -16430,7 +17221,7 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)  	}  	if (st_ops->check_member) { -		int err = st_ops->check_member(t, member); +		int err = st_ops->check_member(t, member, prog);  		if (err) {  			verbose(env, "attach to unsupported member %s of struct %s\n", @@ -16511,6 +17302,12 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,  	if (tgt_prog) {  		struct bpf_prog_aux *aux = tgt_prog->aux; +		if (bpf_prog_is_dev_bound(prog->aux) && +		    !bpf_prog_dev_bound_match(prog, tgt_prog)) { +			bpf_log(log, "Target program bound device mismatch"); +			return -EINVAL; +		} +  		for (i = 0; i < aux->func_info_cnt; i++)  			if (aux->func_info[i].type_id == btf_id) {  				subprog = i; @@ -16732,6 +17529,24 @@ BTF_ID(func, rcu_read_unlock_strict)  #endif  BTF_SET_END(btf_id_deny) +static bool can_be_sleepable(struct bpf_prog *prog) +{ +	if (prog->type == BPF_PROG_TYPE_TRACING) { +		switch (prog->expected_attach_type) { +		case BPF_TRACE_FENTRY: +		case BPF_TRACE_FEXIT: +		case BPF_MODIFY_RETURN: +		case BPF_TRACE_ITER: +			return true; +		default: +			return false; +		} +	} +	return prog->type == BPF_PROG_TYPE_LSM || +	       prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ || +	       prog->type == BPF_PROG_TYPE_STRUCT_OPS; +} +  static int check_attach_btf_id(struct bpf_verifier_env *env)  {  	struct bpf_prog *prog = env->prog; @@ -16750,9 +17565,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)  		return -EINVAL;  	} -	if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && -	    prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) { -		verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n"); +	if (prog->aux->sleepable && !can_be_sleepable(prog)) { +		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");  		return -EINVAL;  	} @@ -16931,7 +17745,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)  	if (ret < 0)  		goto skip_full_check; -	if (bpf_prog_is_dev_bound(env->prog->aux)) { +	if (bpf_prog_is_offloaded(env->prog->aux)) {  		ret = bpf_prog_offload_verifier_prep(env->prog);  		if (ret)  			goto skip_full_check; @@ -16944,7 +17758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)  	ret = do_check_subprogs(env);  	ret = ret ?: do_check_main(env); -	if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux)) +	if (ret == 0 && bpf_prog_is_offloaded(env->prog->aux))  		ret = bpf_prog_offload_finalize(env);  skip_full_check: @@ -16979,7 +17793,7 @@ skip_full_check:  	/* do 32-bit optimization after insn patching has done so those patched  	 * insns could be handled correctly.  	 */ -	if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { +	if (ret == 0 && !bpf_prog_is_offloaded(env->prog->aux)) {  		ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);  		env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? 
!ret  								     : false; diff --git a/kernel/capability.c b/kernel/capability.c index 860fd22117c1..3e058f41df32 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -20,13 +20,6 @@  #include <linux/user_namespace.h>  #include <linux/uaccess.h> -/* - * Leveraged for setting/resetting capabilities - */ - -const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -EXPORT_SYMBOL(__cap_empty_set); -  int file_caps_enabled = 1;  static int __init file_caps_disable(char *str) @@ -151,6 +144,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)  	pid_t pid;  	unsigned tocopy;  	kernel_cap_t pE, pI, pP; +	struct __user_cap_data_struct kdata[2];  	ret = cap_validate_magic(header, &tocopy);  	if ((dataptr == NULL) || (ret != 0)) @@ -163,42 +157,46 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)  		return -EINVAL;  	ret = cap_get_target_pid(pid, &pE, &pI, &pP); -	if (!ret) { -		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; -		unsigned i; - -		for (i = 0; i < tocopy; i++) { -			kdata[i].effective = pE.cap[i]; -			kdata[i].permitted = pP.cap[i]; -			kdata[i].inheritable = pI.cap[i]; -		} - -		/* -		 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, -		 * we silently drop the upper capabilities here. This -		 * has the effect of making older libcap -		 * implementations implicitly drop upper capability -		 * bits when they perform a: capget/modify/capset -		 * sequence. -		 * -		 * This behavior is considered fail-safe -		 * behavior. Upgrading the application to a newer -		 * version of libcap will enable access to the newer -		 * capabilities. -		 * -		 * An alternative would be to return an error here -		 * (-ERANGE), but that causes legacy applications to -		 * unexpectedly fail; the capget/modify/capset aborts -		 * before modification is attempted and the application -		 * fails. -		 */ -		if (copy_to_user(dataptr, kdata, tocopy -				 * sizeof(struct __user_cap_data_struct))) { -			return -EFAULT; -		} -	} +	if (ret) +		return ret; -	return ret; +	/* +	 * Annoying legacy format with 64-bit capabilities exposed +	 * as two sets of 32-bit fields, so we need to split the +	 * capability values up. +	 */ +	kdata[0].effective   = pE.val; kdata[1].effective   = pE.val >> 32; +	kdata[0].permitted   = pP.val; kdata[1].permitted   = pP.val >> 32; +	kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; + +	/* +	 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, +	 * we silently drop the upper capabilities here. This +	 * has the effect of making older libcap +	 * implementations implicitly drop upper capability +	 * bits when they perform a: capget/modify/capset +	 * sequence. +	 * +	 * This behavior is considered fail-safe +	 * behavior. Upgrading the application to a newer +	 * version of libcap will enable access to the newer +	 * capabilities. +	 * +	 * An alternative would be to return an error here +	 * (-ERANGE), but that causes legacy applications to +	 * unexpectedly fail; the capget/modify/capset aborts +	 * before modification is attempted and the application +	 * fails. 
+	 */ +	if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) +		return -EFAULT; + +	return 0; +} + +static kernel_cap_t mk_kernel_cap(u32 low, u32 high) +{ +	return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK };  }  /** @@ -221,8 +219,8 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)   */  SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)  { -	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; -	unsigned i, tocopy, copybytes; +	struct __user_cap_data_struct kdata[2] = { { 0, }, }; +	unsigned tocopy, copybytes;  	kernel_cap_t inheritable, permitted, effective;  	struct cred *new;  	int ret; @@ -246,21 +244,9 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)  	if (copy_from_user(&kdata, data, copybytes))  		return -EFAULT; -	for (i = 0; i < tocopy; i++) { -		effective.cap[i] = kdata[i].effective; -		permitted.cap[i] = kdata[i].permitted; -		inheritable.cap[i] = kdata[i].inheritable; -	} -	while (i < _KERNEL_CAPABILITY_U32S) { -		effective.cap[i] = 0; -		permitted.cap[i] = 0; -		inheritable.cap[i] = 0; -		i++; -	} - -	effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; -	permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; -	inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; +	effective   = mk_kernel_cap(kdata[0].effective,   kdata[1].effective); +	permitted   = mk_kernel_cap(kdata[0].permitted,   kdata[1].permitted); +	inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);  	new = prepare_creds();  	if (!new) @@ -486,11 +472,11 @@ EXPORT_SYMBOL(file_ns_capable);   * Return true if the inode uid and gid are within the namespace.   */  bool privileged_wrt_inode_uidgid(struct user_namespace *ns, -				 struct user_namespace *mnt_userns, +				 struct mnt_idmap *idmap,  				 const struct inode *inode)  { -	return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) && -	       vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode)); +	return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && +	       vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));  }  /** @@ -502,13 +488,13 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,   * its own user namespace and that the given inode's uid and gid are   * mapped into the current user namespace.   */ -bool capable_wrt_inode_uidgid(struct user_namespace *mnt_userns, +bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,  			      const struct inode *inode, int cap)  {  	struct user_namespace *ns = current_user_ns();  	return ns_capable(ns, cap) && -	       privileged_wrt_inode_uidgid(ns, mnt_userns, inode); +	       privileged_wrt_inode_uidgid(ns, idmap, inode);  }  EXPORT_SYMBOL(capable_wrt_inode_uidgid); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c099cf3fa02d..935e8121b21e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5065,7 +5065,7 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)  	if (!inode)  		return -ENOMEM; -	ret = inode_permission(&init_user_ns, inode, MAY_WRITE); +	ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);  	iput(inode);  	return ret;  } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a29c0b13706b..636f1c682ac0 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1205,12 +1205,13 @@ void rebuild_sched_domains(void)  /**   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.   
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @new_cpus: the temp variable for the new effective_cpus mask   *   * Iterate through each task of @cs updating its cpus_allowed to the   * effective cpuset's.  As this function is called with cpuset_rwsem held,   * cpuset membership stays stable.   */ -static void update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)  {  	struct css_task_iter it;  	struct task_struct *task; @@ -1224,7 +1225,10 @@ static void update_tasks_cpumask(struct cpuset *cs)  		if (top_cs && (task->flags & PF_KTHREAD) &&  		    kthread_is_per_cpu(task))  			continue; -		set_cpus_allowed_ptr(task, cs->effective_cpus); + +		cpumask_and(new_cpus, cs->effective_cpus, +			    task_cpu_possible_mask(task)); +		set_cpus_allowed_ptr(task, new_cpus);  	}  	css_task_iter_end(&it);  } @@ -1267,7 +1271,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,  		       int turning_on);  /**   * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset - * @cpuset:  The cpuset that requests change in partition root state + * @cs:      The cpuset that requests change in partition root state   * @cmd:     Partition root state change command   * @newmask: Optional new cpumask for partcmd_update   * @tmp:     Temporary addmask and delmask @@ -1346,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,  		 * A parent can be left with no CPU as long as there is no  		 * task directly associated with the parent partition.  		 */ -		if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) && +		if (cpumask_subset(parent->effective_cpus, cs->cpus_allowed) &&  		    partition_is_populated(parent, cs))  			return PERR_NOCPUS; @@ -1509,7 +1513,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,  	spin_unlock_irq(&callback_lock);  	if (adding || deleting) -		update_tasks_cpumask(parent); +		update_tasks_cpumask(parent, tmp->new_cpus);  	/*  	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. @@ -1661,7 +1665,7 @@ update_parent_subparts:  		WARN_ON(!is_in_v2_mode() &&  			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); -		update_tasks_cpumask(cp); +		update_tasks_cpumask(cp, tmp->new_cpus);  		/*  		 * On legacy hierarchy, if the effective cpumask of any non- @@ -2309,7 +2313,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)  		}  	} -	update_tasks_cpumask(parent); +	update_tasks_cpumask(parent, tmpmask.new_cpus);  	if (parent->child_ecpus_count)  		update_sibling_cpumasks(parent, cs, &tmpmask); @@ -2324,6 +2328,7 @@ out:  		new_prs = -new_prs;  	spin_lock_irq(&callback_lock);  	cs->partition_root_state = new_prs; +	WRITE_ONCE(cs->prs_err, err);  	spin_unlock_irq(&callback_lock);  	/*  	 * Update child cpusets, if present. @@ -3281,8 +3286,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {  int __init cpuset_init(void)  { -	BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); -  	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));  	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));  	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); @@ -3347,7 +3350,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,  	 * as the tasks will be migrated to an ancestor.  	 
*/  	if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) -		update_tasks_cpumask(cs); +		update_tasks_cpumask(cs, new_cpus);  	if (mems_updated && !nodes_empty(cs->mems_allowed))  		update_tasks_nodemask(cs); @@ -3384,7 +3387,7 @@ hotplug_update_tasks(struct cpuset *cs,  	spin_unlock_irq(&callback_lock);  	if (cpus_updated) -		update_tasks_cpumask(cs); +		update_tasks_cpumask(cs, new_cpus);  	if (mems_updated)  		update_tasks_nodemask(cs);  } @@ -3691,15 +3694,38 @@ void __init cpuset_init_smp(void)   * Description: Returns the cpumask_var_t cpus_allowed of the cpuset   * attached to the specified @tsk.  Guaranteed to return some non-empty   * subset of cpu_online_mask, even if this means going outside the - * tasks cpuset. + * tasks cpuset, except when the task is in the top cpuset.   **/  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)  {  	unsigned long flags; +	struct cpuset *cs;  	spin_lock_irqsave(&callback_lock, flags); -	guarantee_online_cpus(tsk, pmask); +	rcu_read_lock(); + +	cs = task_cs(tsk); +	if (cs != &top_cpuset) +		guarantee_online_cpus(tsk, pmask); +	/* +	 * Tasks in the top cpuset won't get update to their cpumasks +	 * when a hotplug online/offline event happens. So we include all +	 * offline cpus in the allowed cpu list. +	 */ +	if ((cs == &top_cpuset) || cpumask_empty(pmask)) { +		const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); + +		/* +		 * We first exclude cpus allocated to partitions. If there is no +		 * allowable online cpu left, we fall back to all possible cpus. +		 */ +		cpumask_andnot(pmask, possible_mask, top_cpuset.subparts_cpus); +		if (!cpumask_intersects(pmask, cpu_online_mask)) +			cpumask_copy(pmask, possible_mask); +	} + +	rcu_read_unlock();  	spin_unlock_irqrestore(&callback_lock, flags);  } @@ -3879,8 +3905,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)  }  /** - * cpuset_mem_spread_node() - On which node to begin search for a file page - * cpuset_slab_spread_node() - On which node to begin search for a slab page + * cpuset_spread_node() - On which node to begin search for a page   *   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for   * tasks in a cpuset with is_spread_page or is_spread_slab set),  @@ -3904,12 +3929,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)   * is passed an offline node, it will fall back to the local node.   * See kmem_cache_alloc_node().   */ -  static int cpuset_spread_node(int *rotor)  {  	return *rotor = next_node_in(*rotor, current->mems_allowed);  } +/** + * cpuset_mem_spread_node() - On which node to begin search for a file page + */  int cpuset_mem_spread_node(void)  {  	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)  @@ -3919,6 +3946,9 @@ int cpuset_mem_spread_node(void)  	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);  } +/** + * cpuset_slab_spread_node() - On which node to begin search for a slab page + */  int cpuset_slab_spread_node(void)  {  	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)  @@ -3927,7 +3957,6 @@ int cpuset_slab_spread_node(void)  	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);  } -  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);  /** diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 793ecff29038..831f1f472bb8 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -26,7 +26,7 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)   * rstat_cpu->updated_children list.  
See the comment on top of   * cgroup_rstat_cpu definition for details.   */ -void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)  {  	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);  	unsigned long flags; @@ -231,7 +231,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)   *   * This function may block.   */ -void cgroup_rstat_flush(struct cgroup *cgrp) +__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)  {  	might_sleep(); diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 77978e372377..a09f1c19336a 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state)  			 * In this we case we don't care about any concurrency/ordering.  			 */  			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) -				atomic_set(&ct->state, state); +				arch_atomic_set(&ct->state, state);  		} else {  			/*  			 * Even if context tracking is disabled on this CPU, because it's outside @@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state)  			 */  			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {  				/* Tracking for vtime only, no concurrent RCU EQS accounting */ -				atomic_set(&ct->state, state); +				arch_atomic_set(&ct->state, state);  			} else {  				/*  				 * Tracking for vtime and RCU EQS. Make sure we don't race @@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state)  				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully  				 * ordered.  				 */ -				atomic_add(state, &ct->state); +				arch_atomic_add(state, &ct->state);  			}  		}  	} @@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state)  			 * In this we case we don't care about any concurrency/ordering.  			 */  			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) -				atomic_set(&ct->state, CONTEXT_KERNEL); +				arch_atomic_set(&ct->state, CONTEXT_KERNEL);  		} else {  			if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) {  				/* Tracking for vtime only, no concurrent RCU EQS accounting */ -				atomic_set(&ct->state, CONTEXT_KERNEL); +				arch_atomic_set(&ct->state, CONTEXT_KERNEL);  			} else {  				/*  				 * Tracking for vtime and RCU EQS. Make sure we don't race @@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state)  				 * RCU only requires RCU_DYNTICKS_IDX increments to be fully  				 * ordered.  				 */ -				atomic_sub(state, &ct->state); +				arch_atomic_sub(state, &ct->state);  			}  		}  	} diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index ba4ba71facf9..b0f0d15085db 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -30,16 +30,9 @@ static int cpu_pm_notify(enum cpu_pm_event event)  {  	int ret; -	/* -	 * This introduces a RCU read critical section, which could be -	 * disfunctional in cpu idle. Copy RCU_NONIDLE code to let RCU know -	 * this. 
-	 */ -	ct_irq_enter_irqson();  	rcu_read_lock();  	ret = raw_notifier_call_chain(&cpu_pm_notifier.chain, event, NULL);  	rcu_read_unlock(); -	ct_irq_exit_irqson();  	return notifier_to_errno(ret);  } @@ -49,11 +42,9 @@ static int cpu_pm_notify_robust(enum cpu_pm_event event_up, enum cpu_pm_event ev  	unsigned long flags;  	int ret; -	ct_irq_enter_irqson();  	raw_spin_lock_irqsave(&cpu_pm_notifier.lock, flags);  	ret = raw_notifier_call_chain_robust(&cpu_pm_notifier.chain, event_up, event_down, NULL);  	raw_spin_unlock_irqrestore(&cpu_pm_notifier.lock, flags); -	ct_irq_exit_irqson();  	return notifier_to_errno(ret);  } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 87ef6096823f..755f5f08ab38 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,8 +455,8 @@ static int __init crash_save_vmcoreinfo_init(void)  	VMCOREINFO_OFFSET(page, lru);  	VMCOREINFO_OFFSET(page, _mapcount);  	VMCOREINFO_OFFSET(page, private); -	VMCOREINFO_OFFSET(page, compound_dtor); -	VMCOREINFO_OFFSET(page, compound_order); +	VMCOREINFO_OFFSET(folio, _folio_dtor); +	VMCOREINFO_OFFSET(folio, _folio_order);  	VMCOREINFO_OFFSET(page, compound_head);  	VMCOREINFO_OFFSET(pglist_data, node_zones);  	VMCOREINFO_OFFSET(pglist_data, nr_zones); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index a34c38bbe28f..03e3251cd9d2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -156,14 +156,6 @@ setup_io_tlb_npages(char *str)  }  early_param("swiotlb", setup_io_tlb_npages); -unsigned int swiotlb_max_segment(void) -{ -	if (!io_tlb_default_mem.nslabs) -		return 0; -	return rounddown(io_tlb_default_mem.nslabs << IO_TLB_SHIFT, PAGE_SIZE); -} -EXPORT_SYMBOL_GPL(swiotlb_max_segment); -  unsigned long swiotlb_size_or_default(void)  {  	return default_nslabs << IO_TLB_SHIFT; @@ -300,7 +292,8 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,  	return;  } -static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, +static void __init *swiotlb_memblock_alloc(unsigned long nslabs, +		unsigned int flags,  		int (*remap)(void *tlb, unsigned long nslabs))  {  	size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); diff --git a/kernel/events/core.c b/kernel/events/core.c index d56328e5080e..f79fd8b87f75 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4813,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,  		cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);  		epc = &cpc->epc; - +		raw_spin_lock_irq(&ctx->lock);  		if (!epc->ctx) {  			atomic_set(&epc->refcount, 1);  			epc->embedded = 1; -			raw_spin_lock_irq(&ctx->lock);  			list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);  			epc->ctx = ctx; -			raw_spin_unlock_irq(&ctx->lock);  		} else {  			WARN_ON_ONCE(epc->ctx != ctx);  			atomic_inc(&epc->refcount);  		} - +		raw_spin_unlock_irq(&ctx->lock);  		return epc;  	} @@ -4896,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head)  static void put_pmu_ctx(struct perf_event_pmu_context *epc)  { +	struct perf_event_context *ctx = epc->ctx;  	unsigned long flags; -	if (!atomic_dec_and_test(&epc->refcount)) +	/* +	 * XXX +	 * +	 * lockdep_assert_held(&ctx->mutex); +	 * +	 * can't because of the call-site in _free_event()/put_event() +	 * which isn't always called under ctx->mutex. 
+	 */ +	if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags))  		return; -	if (epc->ctx) { -		struct perf_event_context *ctx = epc->ctx; +	WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); -		/* -		 * XXX -		 * -		 * lockdep_assert_held(&ctx->mutex); -		 * -		 * can't because of the call-site in _free_event()/put_event() -		 * which isn't always called under ctx->mutex. -		 */ - -		WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); -		raw_spin_lock_irqsave(&ctx->lock, flags); -		list_del_init(&epc->pmu_ctx_entry); -		epc->ctx = NULL; -		raw_spin_unlock_irqrestore(&ctx->lock, flags); -	} +	list_del_init(&epc->pmu_ctx_entry); +	epc->ctx = NULL;  	WARN_ON_ONCE(!list_empty(&epc->pinned_active));  	WARN_ON_ONCE(!list_empty(&epc->flexible_active)); +	raw_spin_unlock_irqrestore(&ctx->lock, flags); +  	if (epc->embedded)  		return; @@ -6573,7 +6568,7 @@ aux_unlock:  	 * Since pinned accounting is per vm we cannot allow fork() to copy our  	 * vma.  	 */ -	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; +	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);  	vma->vm_ops = &perf_mmap_vmops;  	if (event->pmu->event_mapped) @@ -7046,13 +7041,20 @@ out_put:  	ring_buffer_put(rb);  } -static void __perf_event_header__init_id(struct perf_event_header *header, -					 struct perf_sample_data *data, +/* + * A set of common sample data types saved even for non-sample records + * when event->attr.sample_id_all is set. + */ +#define PERF_SAMPLE_ID_ALL  (PERF_SAMPLE_TID | PERF_SAMPLE_TIME |	\ +			     PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID |	\ +			     PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) + +static void __perf_event_header__init_id(struct perf_sample_data *data,  					 struct perf_event *event,  					 u64 sample_type)  {  	data->type = event->attr.sample_type; -	header->size += event->id_header_size; +	data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;  	if (sample_type & PERF_SAMPLE_TID) {  		/* namespace issues */ @@ -7079,8 +7081,10 @@ void perf_event_header__init_id(struct perf_event_header *header,  				struct perf_sample_data *data,  				struct perf_event *event)  { -	if (event->attr.sample_id_all) -		__perf_event_header__init_id(header, data, event, event->attr.sample_type); +	if (event->attr.sample_id_all) { +		header->size += event->id_header_size; +		__perf_event_header__init_id(data, event, event->attr.sample_type); +	}  }  static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -7310,7 +7314,7 @@ void perf_output_sample(struct perf_output_handle *handle,  	}  	if (sample_type & PERF_SAMPLE_BRANCH_STACK) { -		if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { +		if (data->br_stack) {  			size_t size;  			size = data->br_stack->nr @@ -7554,83 +7558,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)  	return callchain ?: &__empty_callchain;  } -void perf_prepare_sample(struct perf_event_header *header, -			 struct perf_sample_data *data, +static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) +{ +	return d * !!(flags & s); +} + +void perf_prepare_sample(struct perf_sample_data *data,  			 struct perf_event *event,  			 struct pt_regs *regs)  {  	u64 sample_type = event->attr.sample_type;  	u64 filtered_sample_type; -	header->type = PERF_RECORD_SAMPLE; -	header->size = sizeof(*header) + event->header_size; - -	header->misc = 0; -	header->misc |= perf_misc_flags(regs); -  	/* -	 * Clear the sample flags that have already been done by the -	 * PMU driver. +	 * Add the sample flags that are dependent to others.  
And clear the +	 * sample flags that have already been done by the PMU driver.  	 */ -	filtered_sample_type = sample_type & ~data->sample_flags; -	__perf_event_header__init_id(header, data, event, filtered_sample_type); - -	if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) -		data->ip = perf_instruction_pointer(regs); - -	if (sample_type & PERF_SAMPLE_CALLCHAIN) { -		int size = 1; - -		if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) -			data->callchain = perf_callchain(event, regs); +	filtered_sample_type = sample_type; +	filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, +					   PERF_SAMPLE_IP); +	filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | +					   PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); +	filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, +					   PERF_SAMPLE_REGS_USER); +	filtered_sample_type &= ~data->sample_flags; -		size += data->callchain->nr; - -		header->size += size * sizeof(u64); +	if (filtered_sample_type == 0) { +		/* Make sure it has the correct data->type for output */ +		data->type = event->attr.sample_type; +		return;  	} -	if (sample_type & PERF_SAMPLE_RAW) { -		struct perf_raw_record *raw = data->raw; -		int size; +	__perf_event_header__init_id(data, event, filtered_sample_type); -		if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { -			struct perf_raw_frag *frag = &raw->frag; -			u32 sum = 0; +	if (filtered_sample_type & PERF_SAMPLE_IP) { +		data->ip = perf_instruction_pointer(regs); +		data->sample_flags |= PERF_SAMPLE_IP; +	} -			do { -				sum += frag->size; -				if (perf_raw_frag_last(frag)) -					break; -				frag = frag->next; -			} while (1); +	if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) +		perf_sample_save_callchain(data, event, regs); -			size = round_up(sum + sizeof(u32), sizeof(u64)); -			raw->size = size - sizeof(u32); -			frag->pad = raw->size - sum; -		} else { -			size = sizeof(u64); -			data->raw = NULL; -		} - -		header->size += size; +	if (filtered_sample_type & PERF_SAMPLE_RAW) { +		data->raw = NULL; +		data->dyn_size += sizeof(u64); +		data->sample_flags |= PERF_SAMPLE_RAW;  	} -	if (sample_type & PERF_SAMPLE_BRANCH_STACK) { -		int size = sizeof(u64); /* nr */ -		if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { -			if (branch_sample_hw_index(event)) -				size += sizeof(u64); - -			size += data->br_stack->nr -			      * sizeof(struct perf_branch_entry); -		} -		header->size += size; +	if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { +		data->br_stack = NULL; +		data->dyn_size += sizeof(u64); +		data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;  	} -	if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) +	if (filtered_sample_type & PERF_SAMPLE_REGS_USER)  		perf_sample_regs_user(&data->regs_user, regs); -	if (sample_type & PERF_SAMPLE_REGS_USER) { +	/* +	 * It cannot use the filtered_sample_type here as REGS_USER can be set +	 * by STACK_USER (using __cond_set() above) and we don't want to update +	 * the dyn_size if it's not requested by users. 
+	 */ +	if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {  		/* regs dump ABI info */  		int size = sizeof(u64); @@ -7639,10 +7628,11 @@ void perf_prepare_sample(struct perf_event_header *header,  			size += hweight64(mask) * sizeof(u64);  		} -		header->size += size; +		data->dyn_size += size; +		data->sample_flags |= PERF_SAMPLE_REGS_USER;  	} -	if (sample_type & PERF_SAMPLE_STACK_USER) { +	if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {  		/*  		 * Either we need PERF_SAMPLE_STACK_USER bit to be always  		 * processed as the last one or have additional check added @@ -7650,9 +7640,10 @@ void perf_prepare_sample(struct perf_event_header *header,  		 * up the rest of the sample size.  		 */  		u16 stack_size = event->attr.sample_stack_user; +		u16 header_size = perf_sample_data_size(data, event);  		u16 size = sizeof(u64); -		stack_size = perf_sample_ustack_size(stack_size, header->size, +		stack_size = perf_sample_ustack_size(stack_size, header_size,  						     data->regs_user.regs);  		/* @@ -7664,24 +7655,31 @@ void perf_prepare_sample(struct perf_event_header *header,  			size += sizeof(u64) + stack_size;  		data->stack_user_size = stack_size; -		header->size += size; +		data->dyn_size += size; +		data->sample_flags |= PERF_SAMPLE_STACK_USER;  	} -	if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) +	if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {  		data->weight.full = 0; +		data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; +	} -	if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) +	if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {  		data->data_src.val = PERF_MEM_NA; +		data->sample_flags |= PERF_SAMPLE_DATA_SRC; +	} -	if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) +	if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {  		data->txn = 0; +		data->sample_flags |= PERF_SAMPLE_TRANSACTION; +	} -	if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { -		if (filtered_sample_type & PERF_SAMPLE_ADDR) -			data->addr = 0; +	if (filtered_sample_type & PERF_SAMPLE_ADDR) { +		data->addr = 0; +		data->sample_flags |= PERF_SAMPLE_ADDR;  	} -	if (sample_type & PERF_SAMPLE_REGS_INTR) { +	if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {  		/* regs dump ABI info */  		int size = sizeof(u64); @@ -7693,20 +7691,23 @@ void perf_prepare_sample(struct perf_event_header *header,  			size += hweight64(mask) * sizeof(u64);  		} -		header->size += size; +		data->dyn_size += size; +		data->sample_flags |= PERF_SAMPLE_REGS_INTR;  	} -	if (sample_type & PERF_SAMPLE_PHYS_ADDR && -	    filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) +	if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {  		data->phys_addr = perf_virt_to_phys(data->addr); +		data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; +	}  #ifdef CONFIG_CGROUP_PERF -	if (sample_type & PERF_SAMPLE_CGROUP) { +	if (filtered_sample_type & PERF_SAMPLE_CGROUP) {  		struct cgroup *cgrp;  		/* protected by RCU */  		cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;  		data->cgroup = cgroup_id(cgrp); +		data->sample_flags |= PERF_SAMPLE_CGROUP;  	}  #endif @@ -7715,16 +7716,21 @@ void perf_prepare_sample(struct perf_event_header *header,  	 * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,  	 * but the value will not dump to the userspace.  	 
*/ -	if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) +	if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {  		data->data_page_size = perf_get_page_size(data->addr); +		data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; +	} -	if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) +	if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {  		data->code_page_size = perf_get_page_size(data->ip); +		data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; +	} -	if (sample_type & PERF_SAMPLE_AUX) { +	if (filtered_sample_type & PERF_SAMPLE_AUX) {  		u64 size; +		u16 header_size = perf_sample_data_size(data, event); -		header->size += sizeof(u64); /* size */ +		header_size += sizeof(u64); /* size */  		/*  		 * Given the 16bit nature of header::size, an AUX sample can @@ -7732,14 +7738,26 @@ void perf_prepare_sample(struct perf_event_header *header,  		 * Make sure this doesn't happen by using up to U16_MAX bytes  		 * per sample in total (rounded down to 8 byte boundary).  		 */ -		size = min_t(size_t, U16_MAX - header->size, +		size = min_t(size_t, U16_MAX - header_size,  			     event->attr.aux_sample_size);  		size = rounddown(size, 8);  		size = perf_prepare_sample_aux(event, data, size); -		WARN_ON_ONCE(size + header->size > U16_MAX); -		header->size += size; +		WARN_ON_ONCE(size + header_size > U16_MAX); +		data->dyn_size += size + sizeof(u64); /* size above */ +		data->sample_flags |= PERF_SAMPLE_AUX;  	} +} + +void perf_prepare_header(struct perf_event_header *header, +			 struct perf_sample_data *data, +			 struct perf_event *event, +			 struct pt_regs *regs) +{ +	header->type = PERF_RECORD_SAMPLE; +	header->size = perf_sample_data_size(data, event); +	header->misc = perf_misc_flags(regs); +  	/*  	 * If you're adding more sample types here, you likely need to do  	 * something about the overflowing header::size, like repurpose the @@ -7767,7 +7785,8 @@ __perf_event_output(struct perf_event *event,  	/* protect the callchain buffers */  	rcu_read_lock(); -	perf_prepare_sample(&header, data, event, regs); +	perf_prepare_sample(data, event, regs); +	perf_prepare_header(&header, data, event, regs);  	err = output_begin(&handle, data, event, header.size);  	if (err) @@ -9399,6 +9418,7 @@ void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)  	perf_output_end(&handle);  } +EXPORT_SYMBOL_GPL(perf_report_aux_output_id);  static int  __perf_event_account_interrupt(struct perf_event *event, int throttle) @@ -10125,8 +10145,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,  	};  	perf_sample_data_init(&data, 0, 0); -	data.raw = &raw; -	data.sample_flags |= PERF_SAMPLE_RAW; +	perf_sample_save_raw_data(&data, &raw);  	perf_trace_buf_update(record, event_type); @@ -10333,13 +10352,7 @@ static void bpf_overflow_handler(struct perf_event *event,  	rcu_read_lock();  	prog = READ_ONCE(event->prog);  	if (prog) { -		if (prog->call_get_stack && -		    (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && -		    !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { -			data->callchain = perf_callchain(event, regs); -			data->sample_flags |= PERF_SAMPLE_CALLCHAIN; -		} - +		perf_prepare_sample(data, event, regs);  		ret = bpf_prog_run(prog, &ctx);  	}  	rcu_read_unlock(); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d9e357b7e17c..59887c69d54c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -22,7 +22,6 @@  #include <linux/swap.h>		/* folio_free_swap */  #include <linux/ptrace.h>	/* user_enable_single_step */  #include <linux/kdebug.h>	/* 
notifier mechanism */ -#include "../../mm/internal.h"	/* munlock_vma_page */  #include <linux/percpu-rwsem.h>  #include <linux/task_work.h>  #include <linux/shmem_fs.h> @@ -161,7 +160,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	int err;  	struct mmu_notifier_range range; -	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,  				addr + PAGE_SIZE);  	if (new_page) { @@ -1352,7 +1351,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)  }  /* - * Called from mmap_region/vma_adjust with mm->mmap_lock acquired. + * Called from mmap_region/vma_merge with mm->mmap_lock acquired.   *   * Currently we ignore all errors and always return 0, the callers   * can't handle the failure anyway. diff --git a/kernel/exit.c b/kernel/exit.c index 15dc2ec80c46..f2afdb0add7c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -807,6 +807,8 @@ void __noreturn do_exit(long code)  	struct task_struct *tsk = current;  	int group_dead; +	WARN_ON(irqs_disabled()); +  	synchronize_group_exit(tsk, code);  	WARN_ON(tsk->plug); @@ -938,6 +940,11 @@ void __noreturn make_task_dead(int signr)  	if (unlikely(!tsk->pid))  		panic("Attempted to kill the idle task!"); +	if (unlikely(irqs_disabled())) { +		pr_info("note: %s[%d] exited with irqs disabled\n", +			current->comm, task_pid_nr(current)); +		local_irq_enable(); +	}  	if (unlikely(in_atomic())) {  		pr_info("note: %s[%d] exited with preempt_count %d\n",  			current->comm, task_pid_nr(current), @@ -1898,7 +1905,14 @@ bool thread_group_exited(struct pid *pid)  }  EXPORT_SYMBOL(thread_group_exited); -__weak void abort(void) +/* + * This needs to be __function_aligned as GCC implicitly makes any + * implementation of abort() cold and drops alignment specified by + * -falign-functions=N. + * + * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 + */ +__weak __function_aligned void abort(void)  {  	BUG(); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index a7ccd2930c5f..d971a0189319 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -163,10 +163,7 @@ static void fei_debugfs_add_attr(struct fei_attr *attr)  static void fei_debugfs_remove_attr(struct fei_attr *attr)  { -	struct dentry *dir; - -	dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); -	debugfs_remove_recursive(dir); +	debugfs_lookup_and_remove(attr->kp.symbol_name, fei_debugfs_dir);  }  static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe3541897..d8cda4c6de6c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -472,7 +472,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)  		 * orig->shared.rb may be modified concurrently, but the clone  		 * will be reinitialized.  		 
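[Illustrative note, not part of this patch: the change just below swaps the struct assignment for data_race(memcpy(...)). data_race() tells KCSAN that the racy read of *orig is intentional, and wrapping the whole memcpy() keeps that annotation while copying the vma byte for byte. In general the macro simply returns the value of the expression it guards; 'shared_counter' below is a made-up variable for the sketch.]

	/* snapshot a value that may be written concurrently; KCSAN stays quiet */
	int seen = data_race(shared_counter);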
*/ -		*new = data_race(*orig); +		data_race(memcpy(new, orig, sizeof(*new)));  		INIT_LIST_HEAD(&new->anon_vma_chain);  		dup_anon_vma_name(orig, new);  	} @@ -585,8 +585,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,  	int retval;  	unsigned long charge = 0;  	LIST_HEAD(uf); -	MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); -	MA_STATE(mas, &mm->mm_mt, 0, 0); +	VMA_ITERATOR(old_vmi, oldmm, 0); +	VMA_ITERATOR(vmi, mm, 0);  	uprobe_start_dup_mmap();  	if (mmap_write_lock_killable(oldmm)) { @@ -613,11 +613,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,  		goto out;  	khugepaged_fork(mm, oldmm); -	retval = mas_expected_entries(&mas, oldmm->map_count); +	retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);  	if (retval)  		goto out; -	mas_for_each(&old_mas, mpnt, ULONG_MAX) { +	for_each_vma(old_vmi, mpnt) {  		struct file *file;  		if (mpnt->vm_flags & VM_DONTCOPY) { @@ -659,7 +659,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,  			tmp->anon_vma = NULL;  		} else if (anon_vma_fork(tmp, mpnt))  			goto fail_nomem_anon_vma_fork; -		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); +		vm_flags_clear(tmp, VM_LOCKED_MASK);  		file = tmp->vm_file;  		if (file) {  			struct address_space *mapping = file->f_mapping; @@ -683,11 +683,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,  			hugetlb_dup_vma_private(tmp);  		/* Link the vma into the MT */ -		mas.index = tmp->vm_start; -		mas.last = tmp->vm_end - 1; -		mas_store(&mas, tmp); -		if (mas_is_err(&mas)) -			goto fail_nomem_mas_store; +		if (vma_iter_bulk_store(&vmi, tmp)) +			goto fail_nomem_vmi_store;  		mm->map_count++;  		if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,7 +699,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,  	/* a new mm has just been created */  	retval = arch_dup_mmap(oldmm, mm);  loop_out: -	mas_destroy(&mas); +	vma_iter_free(&vmi);  out:  	mmap_write_unlock(mm);  	flush_tlb_mm(oldmm); @@ -712,7 +709,7 @@ fail_uprobe_end:  	uprobe_end_dup_mmap();  	return retval; -fail_nomem_mas_store: +fail_nomem_vmi_store:  	unlink_anon_vmas(tmp);  fail_nomem_anon_vma_fork:  	mpol_put(vma_policy(tmp)); @@ -1044,7 +1041,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  #endif  #ifdef CONFIG_BLK_CGROUP -	tsk->throttle_queue = NULL; +	tsk->throttle_disk = NULL;  	tsk->use_memdelay = 0;  #endif @@ -1060,6 +1057,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  	tsk->reported_split_lock = 0;  #endif +#ifdef CONFIG_SCHED_MM_CID +	tsk->mm_cid = -1; +	tsk->mm_cid_active = 0; +#endif  	return tsk;  free_stack: @@ -1169,6 +1170,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,  	mm->user_ns = get_user_ns(user_ns);  	lru_gen_init_mm(mm); +	mm_init_cid(mm);  	return mm;  fail_pcpu: @@ -1601,6 +1603,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)  	tsk->mm = mm;  	tsk->active_mm = mm; +	sched_mm_cid_fork(tsk);  	return 0;  } @@ -2933,7 +2936,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)  	 * - make the CLONE_DETACHED bit reusable for clone3  	 * - make the CSIGNAL bits reusable for clone3  	 */ -	if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) +	if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME))))  		return false;  	if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == @@ -3034,7 +3037,7 @@ void __init mm_cache_init(void)  	 * dynamically sized based on the maximum CPU number this system  	 * can have, taking 
hotplug into account (nr_cpu_ids).  	 */ -	mm_size = sizeof(struct mm_struct) + cpumask_size(); +	mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size();  	mm_cachep = kmem_cache_create_usercopy("mm_struct",  			mm_size, ARCH_MIN_MMSTRUCT_ALIGN, diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 473036b43c83..1ef9a87511f5 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -7,13 +7,15 @@ set -e  sfile="$(readlink -f "$0")"  outdir="$(pwd)"  tarfile=$1 -cpio_dir=$outdir/$tarfile.tmp +cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir  dir_list="  include/  arch/$SRCARCH/include/  " +type cpio > /dev/null +  # Support incremental builds by skipping archive generation  # if timestamps of files being archived are not changed. diff --git a/kernel/hung_task.c b/kernel/hung_task.c index c71889f3f3fc..322813366c6c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -142,6 +142,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)  		if (sysctl_hung_task_all_cpu_backtrace)  			hung_task_show_all_bt = true; +		if (!sysctl_hung_task_warnings) +			pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");  	}  	touch_nmi_watchdog(); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index b64c44ae4c25..2531f3496ab6 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI  	depends on SMP  	select IRQ_DOMAIN_HIERARCHY +# Generic IRQ IPI Mux support +config GENERIC_IRQ_IPI_MUX +	bool +	depends on SMP +  # Generic MSI hierarchical interrupt domain support  config GENERIC_MSI_IRQ  	bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index b4f53717d143..f19d3080bf11 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o  obj-$(CONFIG_PM_SLEEP) += pm.o  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o  obj-$(CONFIG_SMP) += affinity.o  obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o  obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index d9a5c1d65a79..44a4eba80315 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,398 +7,7 @@  #include <linux/kernel.h>  #include <linux/slab.h>  #include <linux/cpu.h> -#include <linux/sort.h> - -static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, -				unsigned int cpus_per_vec) -{ -	const struct cpumask *siblmsk; -	int cpu, sibl; - -	for ( ; cpus_per_vec > 0; ) { -		cpu = cpumask_first(nmsk); - -		/* Should not happen, but I'm too lazy to think about it */ -		if (cpu >= nr_cpu_ids) -			return; - -		cpumask_clear_cpu(cpu, nmsk); -		cpumask_set_cpu(cpu, irqmsk); -		cpus_per_vec--; - -		/* If the cpu has siblings, use them first */ -		siblmsk = topology_sibling_cpumask(cpu); -		for (sibl = -1; cpus_per_vec > 0; ) { -			sibl = cpumask_next(sibl, siblmsk); -			if (sibl >= nr_cpu_ids) -				break; -			if (!cpumask_test_and_clear_cpu(sibl, nmsk)) -				continue; -			cpumask_set_cpu(sibl, irqmsk); -			cpus_per_vec--; -		} -	} -} - -static cpumask_var_t *alloc_node_to_cpumask(void) -{ -	cpumask_var_t *masks; -	int node; - -	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); -	if (!masks) -		return NULL; - -	for (node = 0; node < nr_node_ids; node++) { -		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) -			goto out_unwind; -	} - -	return masks; - -out_unwind: -	
while (--node >= 0) -		free_cpumask_var(masks[node]); -	kfree(masks); -	return NULL; -} - -static void free_node_to_cpumask(cpumask_var_t *masks) -{ -	int node; - -	for (node = 0; node < nr_node_ids; node++) -		free_cpumask_var(masks[node]); -	kfree(masks); -} - -static void build_node_to_cpumask(cpumask_var_t *masks) -{ -	int cpu; - -	for_each_possible_cpu(cpu) -		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); -} - -static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, -				const struct cpumask *mask, nodemask_t *nodemsk) -{ -	int n, nodes = 0; - -	/* Calculate the number of nodes in the supplied affinity mask */ -	for_each_node(n) { -		if (cpumask_intersects(mask, node_to_cpumask[n])) { -			node_set(n, *nodemsk); -			nodes++; -		} -	} -	return nodes; -} - -struct node_vectors { -	unsigned id; - -	union { -		unsigned nvectors; -		unsigned ncpus; -	}; -}; - -static int ncpus_cmp_func(const void *l, const void *r) -{ -	const struct node_vectors *ln = l; -	const struct node_vectors *rn = r; - -	return ln->ncpus - rn->ncpus; -} - -/* - * Allocate vector number for each node, so that for each node: - * - * 1) the allocated number is >= 1 - * - * 2) the allocated numbver is <= active CPU number of this node - * - * The actual allocated total vectors may be less than @numvecs when - * active total CPU number is less than @numvecs. - * - * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' - * for each node. - */ -static void alloc_nodes_vectors(unsigned int numvecs, -				cpumask_var_t *node_to_cpumask, -				const struct cpumask *cpu_mask, -				const nodemask_t nodemsk, -				struct cpumask *nmsk, -				struct node_vectors *node_vectors) -{ -	unsigned n, remaining_ncpus = 0; - -	for (n = 0; n < nr_node_ids; n++) { -		node_vectors[n].id = n; -		node_vectors[n].ncpus = UINT_MAX; -	} - -	for_each_node_mask(n, nodemsk) { -		unsigned ncpus; - -		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); -		ncpus = cpumask_weight(nmsk); - -		if (!ncpus) -			continue; -		remaining_ncpus += ncpus; -		node_vectors[n].ncpus = ncpus; -	} - -	numvecs = min_t(unsigned, remaining_ncpus, numvecs); - -	sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), -	     ncpus_cmp_func, NULL); - -	/* -	 * Allocate vectors for each node according to the ratio of this -	 * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is -	 * bigger than number of active numa nodes. Always start the -	 * allocation from the node with minimized nr_cpus. -	 * -	 * This way guarantees that each active node gets allocated at -	 * least one vector, and the theory is simple: over-allocation -	 * is only done when this node is assigned by one vector, so -	 * other nodes will be allocated >= 1 vector, since 'numvecs' is -	 * bigger than number of numa nodes. 
-	 * -	 * One perfect invariant is that number of allocated vectors for -	 * each node is <= CPU count of this node: -	 * -	 * 1) suppose there are two nodes: A and B -	 * 	ncpu(X) is CPU count of node X -	 * 	vecs(X) is the vector count allocated to node X via this -	 * 	algorithm -	 * -	 * 	ncpu(A) <= ncpu(B) -	 * 	ncpu(A) + ncpu(B) = N -	 * 	vecs(A) + vecs(B) = V -	 * -	 * 	vecs(A) = max(1, round_down(V * ncpu(A) / N)) -	 * 	vecs(B) = V - vecs(A) -	 * -	 * 	both N and V are integer, and 2 <= V <= N, suppose -	 * 	V = N - delta, and 0 <= delta <= N - 2 -	 * -	 * 2) obviously vecs(A) <= ncpu(A) because: -	 * -	 * 	if vecs(A) is 1, then vecs(A) <= ncpu(A) given -	 * 	ncpu(A) >= 1 -	 * -	 * 	otherwise, -	 * 		vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N -	 * -	 * 3) prove how vecs(B) <= ncpu(B): -	 * -	 * 	if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be -	 * 	over-allocated, so vecs(B) <= ncpu(B), -	 * -	 * 	otherwise: -	 * -	 * 	vecs(A) = -	 * 		round_down(V * ncpu(A) / N) = -	 * 		round_down((N - delta) * ncpu(A) / N) = -	 * 		round_down((N * ncpu(A) - delta * ncpu(A)) / N)	 >= -	 * 		round_down((N * ncpu(A) - delta * N) / N)	 = -	 * 		cpu(A) - delta -	 * -	 * 	then: -	 * -	 * 	vecs(A) - V >= ncpu(A) - delta - V -	 * 	=> -	 * 	V - vecs(A) <= V + delta - ncpu(A) -	 * 	=> -	 * 	vecs(B) <= N - ncpu(A) -	 * 	=> -	 * 	vecs(B) <= cpu(B) -	 * -	 * For nodes >= 3, it can be thought as one node and another big -	 * node given that is exactly what this algorithm is implemented, -	 * and we always re-calculate 'remaining_ncpus' & 'numvecs', and -	 * finally for each node X: vecs(X) <= ncpu(X). -	 * -	 */ -	for (n = 0; n < nr_node_ids; n++) { -		unsigned nvectors, ncpus; - -		if (node_vectors[n].ncpus == UINT_MAX) -			continue; - -		WARN_ON_ONCE(numvecs == 0); - -		ncpus = node_vectors[n].ncpus; -		nvectors = max_t(unsigned, 1, -				 numvecs * ncpus / remaining_ncpus); -		WARN_ON_ONCE(nvectors > ncpus); - -		node_vectors[n].nvectors = nvectors; - -		remaining_ncpus -= ncpus; -		numvecs -= nvectors; -	} -} - -static int __irq_build_affinity_masks(unsigned int startvec, -				      unsigned int numvecs, -				      unsigned int firstvec, -				      cpumask_var_t *node_to_cpumask, -				      const struct cpumask *cpu_mask, -				      struct cpumask *nmsk, -				      struct irq_affinity_desc *masks) -{ -	unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; -	unsigned int last_affv = firstvec + numvecs; -	unsigned int curvec = startvec; -	nodemask_t nodemsk = NODE_MASK_NONE; -	struct node_vectors *node_vectors; - -	if (cpumask_empty(cpu_mask)) -		return 0; - -	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); - -	/* -	 * If the number of nodes in the mask is greater than or equal the -	 * number of vectors we just spread the vectors across the nodes. 
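[Illustrative sketch, not part of this patch: the node- and sibling-aware spreading logic being deleted in this hunk is not dropped, it is replaced by the generic group_cpus_evenly() helper pulled in via <linux/group_cpus.h> below and consumed by the rewritten irq_create_affinity_masks() at the end of this hunk. The caller receives an array of per-group cpumasks and owns the memory; 'numgrps' is a placeholder for the group count.]

	struct cpumask *masks = group_cpus_evenly(numgrps);

	if (!masks)
		return NULL;	/* same error path the hunk below uses */

	/* masks[0] .. masks[numgrps - 1] now hold evenly grouped CPUs */
	kfree(masks);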
-	 */ -	if (numvecs <= nodes) { -		for_each_node_mask(n, nodemsk) { -			/* Ensure that only CPUs which are in both masks are set */ -			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); -			cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); -			if (++curvec == last_affv) -				curvec = firstvec; -		} -		return numvecs; -	} - -	node_vectors = kcalloc(nr_node_ids, -			       sizeof(struct node_vectors), -			       GFP_KERNEL); -	if (!node_vectors) -		return -ENOMEM; - -	/* allocate vector number for each node */ -	alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, -			    nodemsk, nmsk, node_vectors); - -	for (i = 0; i < nr_node_ids; i++) { -		unsigned int ncpus, v; -		struct node_vectors *nv = &node_vectors[i]; - -		if (nv->nvectors == UINT_MAX) -			continue; - -		/* Get the cpus on this node which are in the mask */ -		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); -		ncpus = cpumask_weight(nmsk); -		if (!ncpus) -			continue; - -		WARN_ON_ONCE(nv->nvectors > ncpus); - -		/* Account for rounding errors */ -		extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); - -		/* Spread allocated vectors on CPUs of the current node */ -		for (v = 0; v < nv->nvectors; v++, curvec++) { -			cpus_per_vec = ncpus / nv->nvectors; - -			/* Account for extra vectors to compensate rounding errors */ -			if (extra_vecs) { -				cpus_per_vec++; -				--extra_vecs; -			} - -			/* -			 * wrapping has to be considered given 'startvec' -			 * may start anywhere -			 */ -			if (curvec >= last_affv) -				curvec = firstvec; -			irq_spread_init_one(&masks[curvec].mask, nmsk, -						cpus_per_vec); -		} -		done += nv->nvectors; -	} -	kfree(node_vectors); -	return done; -} - -/* - * build affinity in two stages: - *	1) spread present CPU on these vectors - *	2) spread other possible CPUs on these vectors - */ -static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, -				    unsigned int firstvec, -				    struct irq_affinity_desc *masks) -{ -	unsigned int curvec = startvec, nr_present = 0, nr_others = 0; -	cpumask_var_t *node_to_cpumask; -	cpumask_var_t nmsk, npresmsk; -	int ret = -ENOMEM; - -	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) -		return ret; - -	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) -		goto fail_nmsk; - -	node_to_cpumask = alloc_node_to_cpumask(); -	if (!node_to_cpumask) -		goto fail_npresmsk; - -	/* Stabilize the cpumasks */ -	cpus_read_lock(); -	build_node_to_cpumask(node_to_cpumask); - -	/* Spread on present CPUs starting from affd->pre_vectors */ -	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, -					 node_to_cpumask, cpu_present_mask, -					 nmsk, masks); -	if (ret < 0) -		goto fail_build_affinity; -	nr_present = ret; - -	/* -	 * Spread on non present CPUs starting from the next vector to be -	 * handled. If the spreading of present CPUs already exhausted the -	 * vector space, assign the non present CPUs to the already spread -	 * out vectors. -	 */ -	if (nr_present >= numvecs) -		curvec = firstvec; -	else -		curvec = firstvec + nr_present; -	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); -	ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, -					 node_to_cpumask, npresmsk, nmsk, -					 masks); -	if (ret >= 0) -		nr_others = ret; - - fail_build_affinity: -	cpus_read_unlock(); - -	if (ret >= 0) -		WARN_ON(nr_present + nr_others < numvecs); - -	free_node_to_cpumask(node_to_cpumask); - - fail_npresmsk: -	free_cpumask_var(npresmsk); - - fail_nmsk: -	free_cpumask_var(nmsk); -	return ret < 0 ? 
ret : 0; -} +#include <linux/group_cpus.h>  static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)  { @@ -461,14 +70,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)  	 */  	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {  		unsigned int this_vecs = affd->set_size[i]; -		int ret; +		int j; +		struct cpumask *result = group_cpus_evenly(this_vecs); -		ret = irq_build_affinity_masks(curvec, this_vecs, -					       curvec, masks); -		if (ret) { +		if (!result) {  			kfree(masks);  			return NULL;  		} + +		for (j = 0; j < this_vecs; j++) +			cpumask_copy(&masks[curvec + j].mask, &result[j]); +		kfree(result); +  		curvec += this_vecs;  		usedvecs += this_vecs;  	} diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c new file mode 100644 index 000000000000..fa4fc18c6131 --- /dev/null +++ b/kernel/irq/ipi-mux.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Multiplex several virtual IPIs over a single HW IPI. + * + * Copyright The Asahi Linux Contributors + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#define pr_fmt(fmt) "ipi-mux: " fmt +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/irqchip.h> +#include <linux/irqchip/chained_irq.h> +#include <linux/irqdomain.h> +#include <linux/jump_label.h> +#include <linux/percpu.h> +#include <linux/smp.h> + +struct ipi_mux_cpu { +	atomic_t			enable; +	atomic_t			bits; +}; + +static struct ipi_mux_cpu __percpu *ipi_mux_pcpu; +static struct irq_domain *ipi_mux_domain; +static void (*ipi_mux_send)(unsigned int cpu); + +static void ipi_mux_mask(struct irq_data *d) +{ +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); + +	atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable); +} + +static void ipi_mux_unmask(struct irq_data *d) +{ +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); +	u32 ibit = BIT(irqd_to_hwirq(d)); + +	atomic_or(ibit, &icpu->enable); + +	/* +	 * The atomic_or() above must complete before the atomic_read() +	 * below to avoid racing ipi_mux_send_mask(). +	 */ +	smp_mb__after_atomic(); + +	/* If a pending IPI was unmasked, raise a parent IPI immediately. */ +	if (atomic_read(&icpu->bits) & ibit) +		ipi_mux_send(smp_processor_id()); +} + +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask) +{ +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); +	u32 ibit = BIT(irqd_to_hwirq(d)); +	unsigned long pending; +	int cpu; + +	for_each_cpu(cpu, mask) { +		icpu = per_cpu_ptr(ipi_mux_pcpu, cpu); + +		/* +		 * This sequence is the mirror of the one in ipi_mux_unmask(); +		 * see the comment there. Additionally, release semantics +		 * ensure that the vIPI flag set is ordered after any shared +		 * memory accesses that precede it. This therefore also pairs +		 * with the atomic_fetch_andnot in ipi_mux_process(). +		 */ +		pending = atomic_fetch_or_release(ibit, &icpu->bits); + +		/* +		 * The atomic_fetch_or_release() above must complete +		 * before the atomic_read() below to avoid racing with +		 * ipi_mux_unmask(). +		 */ +		smp_mb__after_atomic(); + +		/* +		 * The flag writes must complete before the physical IPI is +		 * issued to another CPU. This is implied by the control +		 * dependency on the result of atomic_read() below, which is +		 * itself already ordered after the vIPI flag write. 
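[Illustrative sketch, not part of this patch: the new ipi-mux.c exposes two entry points, ipi_mux_create(), called once by the root irqchip driver that owns the single hardware IPI, and ipi_mux_process(), called from that driver's parent IPI handler to demultiplex pending virtual IPIs. Everything prefixed with "my_"/"MY_" below is an invented name for the sketch.]

	/* driver hook that raises the one hardware IPI towards @cpu */
	static void my_hw_send_ipi(unsigned int cpu)
	{
		/* write the interrupt controller's IPI register for @cpu */
	}

	/* called from the driver's parent IPI handler */
	static void my_handle_parent_ipi(void)
	{
		ipi_mux_process();	/* fan out to the muxed virtual IPIs */
	}

	static int my_ipi_init(void)
	{
		/* returns the first virq of MY_NR_IPIS new virtual IPIs */
		int virq = ipi_mux_create(MY_NR_IPIS, my_hw_send_ipi);

		return virq <= 0 ? -ENODEV : 0;
	}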
+		 */ +		if (!(pending & ibit) && (atomic_read(&icpu->enable) & ibit)) +			ipi_mux_send(cpu); +	} +} + +static const struct irq_chip ipi_mux_chip = { +	.name		= "IPI Mux", +	.irq_mask	= ipi_mux_mask, +	.irq_unmask	= ipi_mux_unmask, +	.ipi_send_mask	= ipi_mux_send_mask, +}; + +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq, +				unsigned int nr_irqs, void *arg) +{ +	int i; + +	for (i = 0; i < nr_irqs; i++) { +		irq_set_percpu_devid(virq + i); +		irq_domain_set_info(d, virq + i, i, &ipi_mux_chip, NULL, +				    handle_percpu_devid_irq, NULL, NULL); +	} + +	return 0; +} + +static const struct irq_domain_ops ipi_mux_domain_ops = { +	.alloc		= ipi_mux_domain_alloc, +	.free		= irq_domain_free_irqs_top, +}; + +/** + * ipi_mux_process - Process multiplexed virtual IPIs + */ +void ipi_mux_process(void) +{ +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu); +	irq_hw_number_t hwirq; +	unsigned long ipis; +	unsigned int en; + +	/* +	 * Reading enable mask does not need to be ordered as long as +	 * this function is called from interrupt handler because only +	 * the CPU itself can change it's own enable mask. +	 */ +	en = atomic_read(&icpu->enable); + +	/* +	 * Clear the IPIs we are about to handle. This pairs with the +	 * atomic_fetch_or_release() in ipi_mux_send_mask(). +	 */ +	ipis = atomic_fetch_andnot(en, &icpu->bits) & en; + +	for_each_set_bit(hwirq, &ipis, BITS_PER_TYPE(int)) +		generic_handle_domain_irq(ipi_mux_domain, hwirq); +} + +/** + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single + * parent IPI. + * @nr_ipi:		number of virtual IPIs to create. This should + *			be <= BITS_PER_TYPE(int) + * @mux_send:		callback to trigger parent IPI for a particular CPU + * + * Returns first virq of the newly created virtual IPIs upon success + * or <=0 upon failure + */ +int ipi_mux_create(unsigned int nr_ipi, void (*mux_send)(unsigned int cpu)) +{ +	struct fwnode_handle *fwnode; +	struct irq_domain *domain; +	int rc; + +	if (ipi_mux_domain) +		return -EEXIST; + +	if (BITS_PER_TYPE(int) < nr_ipi || !mux_send) +		return -EINVAL; + +	ipi_mux_pcpu = alloc_percpu(typeof(*ipi_mux_pcpu)); +	if (!ipi_mux_pcpu) +		return -ENOMEM; + +	fwnode = irq_domain_alloc_named_fwnode("IPI-Mux"); +	if (!fwnode) { +		pr_err("unable to create IPI Mux fwnode\n"); +		rc = -ENOMEM; +		goto fail_free_cpu; +	} + +	domain = irq_domain_create_linear(fwnode, nr_ipi, +					  &ipi_mux_domain_ops, NULL); +	if (!domain) { +		pr_err("unable to add IPI Mux domain\n"); +		rc = -ENOMEM; +		goto fail_free_fwnode; +	} + +	domain->flags |= IRQ_DOMAIN_FLAG_IPI_SINGLE; +	irq_domain_update_bus_token(domain, DOMAIN_BUS_IPI); + +	rc = irq_domain_alloc_irqs(domain, nr_ipi, NUMA_NO_NODE, NULL); +	if (rc <= 0) { +		pr_err("unable to alloc IRQs from IPI Mux domain\n"); +		goto fail_free_domain; +	} + +	ipi_mux_domain = domain; +	ipi_mux_send = mux_send; + +	return rc; + +fail_free_domain: +	irq_domain_remove(domain); +fail_free_fwnode: +	irq_domain_free_fwnode(fwnode); +fail_free_cpu: +	free_percpu(ipi_mux_pcpu); +	return rc; +} diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index bbd945bacef0..961d4af76af3 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -188,9 +188,9 @@ EXPORT_SYMBOL_GPL(ipi_get_hwirq);  static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,  			   const struct cpumask *dest, unsigned int cpu)  { -	const struct cpumask *ipimask = irq_data_get_affinity_mask(data); +	const struct cpumask *ipimask; -	if (!chip || !ipimask) +	if (!chip || !data)  		return 
-EINVAL;  	if (!chip->ipi_send_single && !chip->ipi_send_mask) @@ -199,6 +199,10 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,  	if (cpu >= nr_cpu_ids)  		return -EINVAL; +	ipimask = irq_data_get_affinity_mask(data); +	if (!ipimask) +		return -EINVAL; +  	if (dest) {  		if (!cpumask_subset(dest, ipimask))  			return -EINVAL; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index fd0996274401..240e145e969f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -277,7 +277,7 @@ static struct attribute *irq_attrs[] = {  };  ATTRIBUTE_GROUPS(irq); -static struct kobj_type irq_kobj_type = { +static const struct kobj_type irq_kobj_type = {  	.release	= irq_kobj_release,  	.sysfs_ops	= &kobj_sysfs_ops,  	.default_groups = irq_groups, @@ -335,7 +335,7 @@ postcore_initcall(irq_sysfs_init);  #else /* !CONFIG_SYSFS */ -static struct kobj_type irq_kobj_type = { +static const struct kobj_type irq_kobj_type = {  	.release	= irq_kobj_release,  }; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8fe1da9614ee..f34760a1e222 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -25,6 +25,9 @@ static DEFINE_MUTEX(irq_domain_mutex);  static struct irq_domain *irq_default_domain; +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, +					unsigned int nr_irqs, int node, void *arg, +					bool realloc, const struct irq_affinity_desc *affinity);  static void irq_domain_check_hierarchy(struct irq_domain *domain);  struct irqchip_fwid { @@ -114,7 +117,7 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)  {  	struct irqchip_fwid *fwid; -	if (WARN_ON(!is_fwnode_irqchip(fwnode))) +	if (!fwnode || WARN_ON(!is_fwnode_irqchip(fwnode)))  		return;  	fwid = container_of(fwnode, struct irqchip_fwid, fwnode); @@ -123,23 +126,12 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode)  }  EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); -/** - * __irq_domain_add() - Allocate a new irq_domain data structure - * @fwnode: firmware node for the interrupt controller - * @size: Size of linear map; 0 for radix mapping only - * @hwirq_max: Maximum number of interrupts supported by controller - * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no - *              direct mapping - * @ops: domain callbacks - * @host_data: Controller private data pointer - * - * Allocates and initializes an irq_domain structure. - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, -				    irq_hw_number_t hwirq_max, int direct_max, -				    const struct irq_domain_ops *ops, -				    void *host_data) +static struct irq_domain *__irq_domain_create(struct fwnode_handle *fwnode, +					      unsigned int size, +					      irq_hw_number_t hwirq_max, +					      int direct_max, +					      const struct irq_domain_ops *ops, +					      void *host_data)  {  	struct irqchip_fwid *fwid;  	struct irq_domain *domain; @@ -214,25 +206,66 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int s  	/* Fill structure */  	INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); -	mutex_init(&domain->revmap_mutex);  	domain->ops = ops;  	domain->host_data = host_data;  	domain->hwirq_max = hwirq_max; -	if (direct_max) { +	if (direct_max)  		domain->flags |= IRQ_DOMAIN_FLAG_NO_MAP; -	}  	domain->revmap_size = size; +	/* +	 * Hierarchical domains use the domain lock of the root domain +	 * (innermost domain). 
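[Illustrative sketch, not part of this patch: the comment above describes the new locking model, where every domain in a hierarchy shares the root domain's mutex instead of taking the global irq_domain_mutex or a per-domain revmap lock. A child created with irq_domain_create_hierarchy(), reworked later in this file, inherits the parent's root, so the whole hierarchy serialises on one lock. 'my_domain_ops' and the other arguments are placeholders for the caller's own data.]

	child = irq_domain_create_hierarchy(parent, 0, size, fwnode,
					    &my_domain_ops, host_data);
	/* child->root == parent->root here, so taking &child->root->mutex
	 * takes the same lock every other level of the hierarchy uses. */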
+	 * +	 * For non-hierarchical domains (as for root domains), the root +	 * pointer is set to the domain itself so that &domain->root->mutex +	 * always points to the right lock. +	 */ +	mutex_init(&domain->mutex); +	domain->root = domain; +  	irq_domain_check_hierarchy(domain); +	return domain; +} + +static void __irq_domain_publish(struct irq_domain *domain) +{  	mutex_lock(&irq_domain_mutex);  	debugfs_add_domain_dir(domain);  	list_add(&domain->link, &irq_domain_list);  	mutex_unlock(&irq_domain_mutex);  	pr_debug("Added domain %s\n", domain->name); +} + +/** + * __irq_domain_add() - Allocate a new irq_domain data structure + * @fwnode: firmware node for the interrupt controller + * @size: Size of linear map; 0 for radix mapping only + * @hwirq_max: Maximum number of interrupts supported by controller + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + *              direct mapping + * @ops: domain callbacks + * @host_data: Controller private data pointer + * + * Allocates and initializes an irq_domain structure. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, +				    irq_hw_number_t hwirq_max, int direct_max, +				    const struct irq_domain_ops *ops, +				    void *host_data) +{ +	struct irq_domain *domain; + +	domain = __irq_domain_create(fwnode, size, hwirq_max, direct_max, +				     ops, host_data); +	if (domain) +		__irq_domain_publish(domain); +  	return domain;  }  EXPORT_SYMBOL_GPL(__irq_domain_add); @@ -437,31 +470,6 @@ struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,  EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);  /** - * irq_domain_check_msi_remap - Check whether all MSI irq domains implement - * IRQ remapping - * - * Return: false if any MSI irq domain does not support IRQ remapping, - * true otherwise (including if there is no MSI irq domain) - */ -bool irq_domain_check_msi_remap(void) -{ -	struct irq_domain *h; -	bool ret = true; - -	mutex_lock(&irq_domain_mutex); -	list_for_each_entry(h, &irq_domain_list, link) { -		if (irq_domain_is_msi(h) && -		    !irq_domain_hierarchical_is_msi_remap(h)) { -			ret = false; -			break; -		} -	} -	mutex_unlock(&irq_domain_mutex); -	return ret; -} -EXPORT_SYMBOL_GPL(irq_domain_check_msi_remap); - -/**   * irq_set_default_host() - Set a "default" irq domain   * @domain: default domain pointer   * @@ -502,30 +510,34 @@ static bool irq_domain_is_nomap(struct irq_domain *domain)  static void irq_domain_clear_mapping(struct irq_domain *domain,  				     irq_hw_number_t hwirq)  { +	lockdep_assert_held(&domain->root->mutex); +  	if (irq_domain_is_nomap(domain))  		return; -	mutex_lock(&domain->revmap_mutex);  	if (hwirq < domain->revmap_size)  		rcu_assign_pointer(domain->revmap[hwirq], NULL);  	else  		radix_tree_delete(&domain->revmap_tree, hwirq); -	mutex_unlock(&domain->revmap_mutex);  }  static void irq_domain_set_mapping(struct irq_domain *domain,  				   irq_hw_number_t hwirq,  				   struct irq_data *irq_data)  { +	/* +	 * This also makes sure that all domains point to the same root when +	 * called from irq_domain_insert_irq() for each domain in a hierarchy. 
+	 */ +	lockdep_assert_held(&domain->root->mutex); +  	if (irq_domain_is_nomap(domain))  		return; -	mutex_lock(&domain->revmap_mutex);  	if (hwirq < domain->revmap_size)  		rcu_assign_pointer(domain->revmap[hwirq], irq_data);  	else  		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); -	mutex_unlock(&domain->revmap_mutex);  }  static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) @@ -538,6 +550,9 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)  		return;  	hwirq = irq_data->hwirq; + +	mutex_lock(&domain->root->mutex); +  	irq_set_status_flags(irq, IRQ_NOREQUEST);  	/* remove chip and handler */ @@ -557,10 +572,12 @@ static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)  	/* Clear reverse map for this hwirq */  	irq_domain_clear_mapping(domain, hwirq); + +	mutex_unlock(&domain->root->mutex);  } -int irq_domain_associate(struct irq_domain *domain, unsigned int virq, -			 irq_hw_number_t hwirq) +static int irq_domain_associate_locked(struct irq_domain *domain, unsigned int virq, +				       irq_hw_number_t hwirq)  {  	struct irq_data *irq_data = irq_get_irq_data(virq);  	int ret; @@ -573,7 +590,6 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,  	if (WARN(irq_data->domain, "error: virq%i is already associated", virq))  		return -EINVAL; -	mutex_lock(&irq_domain_mutex);  	irq_data->hwirq = hwirq;  	irq_data->domain = domain;  	if (domain->ops->map) { @@ -590,23 +606,29 @@ int irq_domain_associate(struct irq_domain *domain, unsigned int virq,  			}  			irq_data->domain = NULL;  			irq_data->hwirq = 0; -			mutex_unlock(&irq_domain_mutex);  			return ret;  		} - -		/* If not already assigned, give the domain the chip's name */ -		if (!domain->name && irq_data->chip) -			domain->name = irq_data->chip->name;  	}  	domain->mapcount++;  	irq_domain_set_mapping(domain, hwirq, irq_data); -	mutex_unlock(&irq_domain_mutex);  	irq_clear_status_flags(virq, IRQ_NOREQUEST);  	return 0;  } + +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, +			 irq_hw_number_t hwirq) +{ +	int ret; + +	mutex_lock(&domain->root->mutex); +	ret = irq_domain_associate_locked(domain, virq, hwirq); +	mutex_unlock(&domain->root->mutex); + +	return ret; +}  EXPORT_SYMBOL_GPL(irq_domain_associate);  void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, @@ -619,9 +641,8 @@ void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,  	pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,  		of_node_full_name(of_node), irq_base, (int)hwirq_base, count); -	for (i = 0; i < count; i++) { +	for (i = 0; i < count; i++)  		irq_domain_associate(domain, irq_base + i, hwirq_base + i); -	}  }  EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -668,6 +689,34 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)  EXPORT_SYMBOL_GPL(irq_create_direct_mapping);  #endif +static unsigned int irq_create_mapping_affinity_locked(struct irq_domain *domain, +						       irq_hw_number_t hwirq, +						       const struct irq_affinity_desc *affinity) +{ +	struct device_node *of_node = irq_domain_get_of_node(domain); +	int virq; + +	pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); + +	/* Allocate a virtual interrupt number */ +	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), +				      affinity); +	if (virq <= 0) { +		pr_debug("-> virq allocation failed\n"); +		return 0; +	} + +	if 
(irq_domain_associate_locked(domain, virq, hwirq)) { +		irq_free_desc(virq); +		return 0; +	} + +	pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", +		hwirq, of_node_full_name(of_node), virq); + +	return virq; +} +  /**   * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space   * @domain: domain owning this hardware interrupt or NULL for default domain @@ -680,14 +729,11 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);   * on the number returned from that call.   */  unsigned int irq_create_mapping_affinity(struct irq_domain *domain, -				       irq_hw_number_t hwirq, -				       const struct irq_affinity_desc *affinity) +					 irq_hw_number_t hwirq, +					 const struct irq_affinity_desc *affinity)  { -	struct device_node *of_node;  	int virq; -	pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); -  	/* Look for default domain if necessary */  	if (domain == NULL)  		domain = irq_default_domain; @@ -695,32 +741,19 @@ unsigned int irq_create_mapping_affinity(struct irq_domain *domain,  		WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);  		return 0;  	} -	pr_debug("-> using domain @%p\n", domain); -	of_node = irq_domain_get_of_node(domain); +	mutex_lock(&domain->root->mutex);  	/* Check if mapping already exists */  	virq = irq_find_mapping(domain, hwirq);  	if (virq) { -		pr_debug("-> existing mapping on virq %d\n", virq); -		return virq; -	} - -	/* Allocate a virtual interrupt number */ -	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), -				      affinity); -	if (virq <= 0) { -		pr_debug("-> virq allocation failed\n"); -		return 0; -	} - -	if (irq_domain_associate(domain, virq, hwirq)) { -		irq_free_desc(virq); -		return 0; +		pr_debug("existing mapping on virq %d\n", virq); +		goto out;  	} -	pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", -		hwirq, of_node_full_name(of_node), virq); +	virq = irq_create_mapping_affinity_locked(domain, hwirq, affinity); +out: +	mutex_unlock(&domain->root->mutex);  	return virq;  } @@ -789,6 +822,8 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)  	if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))  		type &= IRQ_TYPE_SENSE_MASK; +	mutex_lock(&domain->root->mutex); +  	/*  	 * If we've already configured this interrupt,  	 * don't do it again, or hell will break loose. @@ -801,7 +836,7 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)  		 * interrupt number.  		 
*/  		if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq)) -			return virq; +			goto out;  		/*  		 * If the trigger type has not been set yet, then set @@ -809,40 +844,45 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)  		 */  		if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {  			irq_data = irq_get_irq_data(virq); -			if (!irq_data) -				return 0; +			if (!irq_data) { +				virq = 0; +				goto out; +			}  			irqd_set_trigger_type(irq_data, type); -			return virq; +			goto out;  		}  		pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",  			hwirq, of_node_full_name(to_of_node(fwspec->fwnode))); -		return 0; +		virq = 0; +		goto out;  	}  	if (irq_domain_is_hierarchy(domain)) { -		virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); -		if (virq <= 0) -			return 0; +		virq = irq_domain_alloc_irqs_locked(domain, -1, 1, NUMA_NO_NODE, +						    fwspec, false, NULL); +		if (virq <= 0) { +			virq = 0; +			goto out; +		}  	} else {  		/* Create mapping */ -		virq = irq_create_mapping(domain, hwirq); +		virq = irq_create_mapping_affinity_locked(domain, hwirq, NULL);  		if (!virq) -			return virq; +			goto out;  	}  	irq_data = irq_get_irq_data(virq); -	if (!irq_data) { -		if (irq_domain_is_hierarchy(domain)) -			irq_domain_free_irqs(virq, 1); -		else -			irq_dispose_mapping(virq); -		return 0; +	if (WARN_ON(!irq_data)) { +		virq = 0; +		goto out;  	}  	/* Store trigger type */  	irqd_set_trigger_type(irq_data, type); +out: +	mutex_unlock(&domain->root->mutex);  	return virq;  } @@ -1102,12 +1142,17 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,  	struct irq_domain *domain;  	if (size) -		domain = irq_domain_create_linear(fwnode, size, ops, host_data); +		domain = __irq_domain_create(fwnode, size, size, 0, ops, host_data);  	else -		domain = irq_domain_create_tree(fwnode, ops, host_data); +		domain = __irq_domain_create(fwnode, 0, ~0, 0, ops, host_data); +  	if (domain) { +		if (parent) +			domain->root = parent->root;  		domain->parent = parent;  		domain->flags |= flags; + +		__irq_domain_publish(domain);  	}  	return domain; @@ -1123,10 +1168,6 @@ static void irq_domain_insert_irq(int virq)  		domain->mapcount++;  		irq_domain_set_mapping(domain, data->hwirq, data); - -		/* If not already assigned, give the domain the chip's name */ -		if (!domain->name && data->chip) -			domain->name = data->chip->name;  	}  	irq_clear_status_flags(virq, IRQ_NOREQUEST); @@ -1426,40 +1467,12 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,  	return domain->ops->alloc(domain, irq_base, nr_irqs, arg);  } -/** - * __irq_domain_alloc_irqs - Allocate IRQs from domain - * @domain:	domain to allocate from - * @irq_base:	allocate specified IRQ number if irq_base >= 0 - * @nr_irqs:	number of IRQs to allocate - * @node:	NUMA node id for memory allocation - * @arg:	domain specific argument - * @realloc:	IRQ descriptors have already been allocated if true - * @affinity:	Optional irq affinity mask for multiqueue devices - * - * Allocate IRQ numbers and initialized all data structures to support - * hierarchy IRQ domains. - * Parameter @realloc is mainly to support legacy IRQs. - * Returns error code or allocated IRQ number - * - * The whole process to setup an IRQ has been split into two steps. - * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ - * descriptor and required hardware resources. The second step, - * irq_domain_activate_irq(), is to program the hardware with preallocated - * resources. 
In this way, it's easier to rollback when failing to - * allocate resources. - */ -int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, -			    unsigned int nr_irqs, int node, void *arg, -			    bool realloc, const struct irq_affinity_desc *affinity) +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, +					unsigned int nr_irqs, int node, void *arg, +					bool realloc, const struct irq_affinity_desc *affinity)  {  	int i, ret, virq; -	if (domain == NULL) { -		domain = irq_default_domain; -		if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) -			return -EINVAL; -	} -  	if (realloc && irq_base >= 0) {  		virq = irq_base;  	} else { @@ -1478,24 +1491,18 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,  		goto out_free_desc;  	} -	mutex_lock(&irq_domain_mutex);  	ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg); -	if (ret < 0) { -		mutex_unlock(&irq_domain_mutex); +	if (ret < 0)  		goto out_free_irq_data; -	}  	for (i = 0; i < nr_irqs; i++) {  		ret = irq_domain_trim_hierarchy(virq + i); -		if (ret) { -			mutex_unlock(&irq_domain_mutex); +		if (ret)  			goto out_free_irq_data; -		}  	} -	 +  	for (i = 0; i < nr_irqs; i++)  		irq_domain_insert_irq(virq + i); -	mutex_unlock(&irq_domain_mutex);  	return virq; @@ -1505,6 +1512,48 @@ out_free_desc:  	irq_free_descs(virq, nr_irqs);  	return ret;  } + +/** + * __irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain:	domain to allocate from + * @irq_base:	allocate specified IRQ number if irq_base >= 0 + * @nr_irqs:	number of IRQs to allocate + * @node:	NUMA node id for memory allocation + * @arg:	domain specific argument + * @realloc:	IRQ descriptors have already been allocated if true + * @affinity:	Optional irq affinity mask for multiqueue devices + * + * Allocate IRQ numbers and initialized all data structures to support + * hierarchy IRQ domains. + * Parameter @realloc is mainly to support legacy IRQs. + * Returns error code or allocated IRQ number + * + * The whole process to setup an IRQ has been split into two steps. + * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ + * descriptor and required hardware resources. The second step, + * irq_domain_activate_irq(), is to program the hardware with preallocated + * resources. In this way, it's easier to rollback when failing to + * allocate resources. + */ +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, +			    unsigned int nr_irqs, int node, void *arg, +			    bool realloc, const struct irq_affinity_desc *affinity) +{ +	int ret; + +	if (domain == NULL) { +		domain = irq_default_domain; +		if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) +			return -EINVAL; +	} + +	mutex_lock(&domain->root->mutex); +	ret = irq_domain_alloc_irqs_locked(domain, irq_base, nr_irqs, node, arg, +					   realloc, affinity); +	mutex_unlock(&domain->root->mutex); + +	return ret; +}  EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);  /* The irq_data was moved, fix the revmap to refer to the new location */ @@ -1512,11 +1561,12 @@ static void irq_domain_fix_revmap(struct irq_data *d)  {  	void __rcu **slot; +	lockdep_assert_held(&d->domain->root->mutex); +  	if (irq_domain_is_nomap(d->domain))  		return;  	/* Fix up the revmap. 
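[Illustrative sketch, not part of this patch: the relocated kernel-doc above keeps describing the same two-step scheme, allocation first, hardware programming second. From a hierarchy-aware caller the first step is the wrapper call below (the same one the new ipi-mux.c uses); the second step, irq_domain_activate_irq(), is driven by the core when the interrupt is actually started up, which is what keeps rollback after a failed allocation cheap.]

	virq = irq_domain_alloc_irqs(domain, nr_irqs, NUMA_NO_NODE, arg);
	if (virq <= 0)
		return virq;	/* nothing has touched the hardware yet */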
*/ -	mutex_lock(&d->domain->revmap_mutex);  	if (d->hwirq < d->domain->revmap_size) {  		/* Not using radix tree */  		rcu_assign_pointer(d->domain->revmap[d->hwirq], d); @@ -1525,7 +1575,6 @@ static void irq_domain_fix_revmap(struct irq_data *d)  		if (slot)  			radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);  	} -	mutex_unlock(&d->domain->revmap_mutex);  }  /** @@ -1541,8 +1590,8 @@ static void irq_domain_fix_revmap(struct irq_data *d)   */  int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)  { -	struct irq_data *child_irq_data; -	struct irq_data *root_irq_data = irq_get_irq_data(virq); +	struct irq_data *irq_data = irq_get_irq_data(virq); +	struct irq_data *parent_irq_data;  	struct irq_desc *desc;  	int rv = 0; @@ -1567,47 +1616,46 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg)  	if (WARN_ON(!irq_domain_is_hierarchy(domain)))  		return -EINVAL; -	if (!root_irq_data) +	if (!irq_data)  		return -EINVAL; -	if (domain->parent != root_irq_data->domain) +	if (domain->parent != irq_data->domain)  		return -EINVAL; -	child_irq_data = kzalloc_node(sizeof(*child_irq_data), GFP_KERNEL, -				      irq_data_get_node(root_irq_data)); -	if (!child_irq_data) +	parent_irq_data = kzalloc_node(sizeof(*parent_irq_data), GFP_KERNEL, +				       irq_data_get_node(irq_data)); +	if (!parent_irq_data)  		return -ENOMEM; -	mutex_lock(&irq_domain_mutex); +	mutex_lock(&domain->root->mutex);  	/* Copy the original irq_data. */ -	*child_irq_data = *root_irq_data; +	*parent_irq_data = *irq_data;  	/* -	 * Overwrite the root_irq_data, which is embedded in struct -	 * irq_desc, with values for this domain. +	 * Overwrite the irq_data, which is embedded in struct irq_desc, with +	 * values for this domain.  	 */ -	root_irq_data->parent_data = child_irq_data; -	root_irq_data->domain = domain; -	root_irq_data->mask = 0; -	root_irq_data->hwirq = 0; -	root_irq_data->chip = NULL; -	root_irq_data->chip_data = NULL; +	irq_data->parent_data = parent_irq_data; +	irq_data->domain = domain; +	irq_data->mask = 0; +	irq_data->hwirq = 0; +	irq_data->chip = NULL; +	irq_data->chip_data = NULL;  	/* May (probably does) set hwirq, chip, etc. */  	rv = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);  	if (rv) {  		/* Restore the original irq_data. 
*/ -		*root_irq_data = *child_irq_data; -		kfree(child_irq_data); +		*irq_data = *parent_irq_data; +		kfree(parent_irq_data);  		goto error;  	} -	irq_domain_fix_revmap(child_irq_data); -	irq_domain_set_mapping(domain, root_irq_data->hwirq, root_irq_data); - +	irq_domain_fix_revmap(parent_irq_data); +	irq_domain_set_mapping(domain, irq_data->hwirq, irq_data);  error: -	mutex_unlock(&irq_domain_mutex); +	mutex_unlock(&domain->root->mutex);  	return rv;  } @@ -1623,8 +1671,8 @@ EXPORT_SYMBOL_GPL(irq_domain_push_irq);   */  int irq_domain_pop_irq(struct irq_domain *domain, int virq)  { -	struct irq_data *root_irq_data = irq_get_irq_data(virq); -	struct irq_data *child_irq_data; +	struct irq_data *irq_data = irq_get_irq_data(virq); +	struct irq_data *parent_irq_data;  	struct irq_data *tmp_irq_data;  	struct irq_desc *desc; @@ -1646,37 +1694,37 @@ int irq_domain_pop_irq(struct irq_domain *domain, int virq)  	if (domain == NULL)  		return -EINVAL; -	if (!root_irq_data) +	if (!irq_data)  		return -EINVAL;  	tmp_irq_data = irq_domain_get_irq_data(domain, virq);  	/* We can only "pop" if this domain is at the top of the list */ -	if (WARN_ON(root_irq_data != tmp_irq_data)) +	if (WARN_ON(irq_data != tmp_irq_data))  		return -EINVAL; -	if (WARN_ON(root_irq_data->domain != domain)) +	if (WARN_ON(irq_data->domain != domain))  		return -EINVAL; -	child_irq_data = root_irq_data->parent_data; -	if (WARN_ON(!child_irq_data)) +	parent_irq_data = irq_data->parent_data; +	if (WARN_ON(!parent_irq_data))  		return -EINVAL; -	mutex_lock(&irq_domain_mutex); +	mutex_lock(&domain->root->mutex); -	root_irq_data->parent_data = NULL; +	irq_data->parent_data = NULL; -	irq_domain_clear_mapping(domain, root_irq_data->hwirq); +	irq_domain_clear_mapping(domain, irq_data->hwirq);  	irq_domain_free_irqs_hierarchy(domain, virq, 1);  	/* Restore the original irq_data. 
*/ -	*root_irq_data = *child_irq_data; +	*irq_data = *parent_irq_data; -	irq_domain_fix_revmap(root_irq_data); +	irq_domain_fix_revmap(irq_data); -	mutex_unlock(&irq_domain_mutex); +	mutex_unlock(&domain->root->mutex); -	kfree(child_irq_data); +	kfree(parent_irq_data);  	return 0;  } @@ -1690,17 +1738,20 @@ EXPORT_SYMBOL_GPL(irq_domain_pop_irq);  void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs)  {  	struct irq_data *data = irq_get_irq_data(virq); +	struct irq_domain *domain;  	int i;  	if (WARN(!data || !data->domain || !data->domain->ops->free,  		 "NULL pointer, cannot free irq\n"))  		return; -	mutex_lock(&irq_domain_mutex); +	domain = data->domain; + +	mutex_lock(&domain->root->mutex);  	for (i = 0; i < nr_irqs; i++)  		irq_domain_remove_irq(virq + i); -	irq_domain_free_irqs_hierarchy(data->domain, virq, nr_irqs); -	mutex_unlock(&irq_domain_mutex); +	irq_domain_free_irqs_hierarchy(domain, virq, nr_irqs); +	mutex_unlock(&domain->root->mutex);  	irq_domain_free_irq_data(virq, nr_irqs);  	irq_free_descs(virq, nr_irqs); @@ -1815,20 +1866,6 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain)  	if (domain->ops->alloc)  		domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY;  } - -/** - * irq_domain_hierarchical_is_msi_remap - Check if the domain or any - * parent has MSI remapping support - * @domain: domain pointer - */ -bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain) -{ -	for (; domain; domain = domain->parent) { -		if (irq_domain_is_msi_remap(domain)) -			return true; -	} -	return false; -}  #else	/* CONFIG_IRQ_DOMAIN_HIERARCHY */  /**   * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain @@ -1865,6 +1902,13 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,  	irq_set_handler_data(virq, handler_data);  } +static int irq_domain_alloc_irqs_locked(struct irq_domain *domain, int irq_base, +					unsigned int nr_irqs, int node, void *arg, +					bool realloc, const struct irq_affinity_desc *affinity) +{ +	return -EINVAL; +} +  static void irq_domain_check_hierarchy(struct irq_domain *domain)  {  } @@ -1915,7 +1959,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)  static void debugfs_remove_domain_dir(struct irq_domain *d)  { -	debugfs_remove(debugfs_lookup(d->name, domain_dir)); +	debugfs_lookup_and_remove(d->name, domain_dir);  }  void __init irq_domain_debugfs_init(struct dentry *root) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5b7cf28df290..8ce75495e04f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -723,10 +723,13 @@ EXPORT_SYMBOL(disable_irq_nosync);   *	to complete before returning. If you use this function while   *	holding a resource the IRQ handler may need you will deadlock.   * - *	This function may be called - with care - from IRQ context. + *	Can only be called from preemptible code as it might sleep when + *	an interrupt thread is associated to @irq. 
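[Illustrative note, not part of this patch: with the might_sleep() added below, disable_irq() now complains (under CONFIG_DEBUG_ATOMIC_SLEEP) as soon as it is called from atomic context, even when no threaded handler is attached and it would not actually have slept. Atomic contexts should keep using the nosync variant.]

	/* process context: disable and wait for running handlers (may sleep) */
	disable_irq(irq);

	/* hard-IRQ or other atomic context: only mark it disabled, do not wait */
	disable_irq_nosync(irq);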
+ *   */  void disable_irq(unsigned int irq)  { +	might_sleep();  	if (!__disable_irq_nosync(irq))  		synchronize_irq(irq);  } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 955267bbc2be..7a97bcb086bf 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -830,11 +830,8 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,  	domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,  					     fwnode, &msi_domain_ops, info); -	if (domain) { -		if (!domain->name && info->chip) -			domain->name = info->chip->name; +	if (domain)  		irq_domain_update_bus_token(domain, info->bus_token); -	}  	return domain;  } @@ -1000,7 +997,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,  fail:  	msi_unlock_descs(dev);  free_fwnode: -	kfree(fwnode); +	irq_domain_free_fwnode(fwnode);  free_bundle:  	kfree(bundle);  	return false; @@ -1013,6 +1010,7 @@ free_bundle:   */  void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)  { +	struct fwnode_handle *fwnode = NULL;  	struct msi_domain_info *info;  	struct irq_domain *domain; @@ -1025,7 +1023,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)  	dev->msi.data->__domains[domid].domain = NULL;  	info = domain->host_data; +	if (irq_domain_is_msi_device(domain)) +		fwnode = domain->fwnode;  	irq_domain_remove(domain); +	irq_domain_free_fwnode(fwnode);  	kfree(container_of(info, struct msi_domain_template, info));  unlock: @@ -1080,10 +1081,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,  	struct xarray *xa;  	int ret, virq; -	if (!msi_ctrl_valid(dev, &ctrl)) -		return -EINVAL; -  	msi_lock_descs(dev); + +	if (!msi_ctrl_valid(dev, &ctrl)) { +		ret = -EINVAL; +		goto unlock; +	} +  	ret = msi_domain_add_simple_msi_descs(dev, &ctrl);  	if (ret)  		goto unlock; @@ -1105,14 +1109,35 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,  	return 0;  fail: -	for (--virq; virq >= virq_base; virq--) +	for (--virq; virq >= virq_base; virq--) { +		msi_domain_depopulate_descs(dev, virq, 1);  		irq_domain_free_irqs_common(domain, virq, 1); +	}  	msi_domain_free_descs(dev, &ctrl);  unlock:  	msi_unlock_descs(dev);  	return ret;  } +void msi_domain_depopulate_descs(struct device *dev, int virq_base, int nvec) +{ +	struct msi_ctrl ctrl = { +		.domid	= MSI_DEFAULT_DOMAIN, +		.first  = virq_base, +		.last	= virq_base + nvec - 1, +	}; +	struct msi_desc *desc; +	struct xarray *xa; +	unsigned long idx; + +	if (!msi_ctrl_valid(dev, &ctrl)) +		return; + +	xa = &dev->msi.data->__domains[ctrl.domid].store; +	xa_for_each_range(xa, idx, desc, ctrl.first, ctrl.last) +		desc->irq = 0; +} +  /*   * Carefully check whether the device can use reservation mode. If   * reservation mode is enabled then the early activation will assign a @@ -1623,3 +1648,30 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)  {  	return (struct msi_domain_info *)domain->host_data;  } + +/** + * msi_device_has_isolated_msi - True if the device has isolated MSI + * @dev: The device to check + * + * Isolated MSI means that HW modeled by an irq_domain on the path from the + * initiating device to the CPU will validate that the MSI message specifies an + * interrupt number that the device is authorized to trigger. This must block + * devices from triggering interrupts they are not authorized to trigger. + * Currently authorization means the MSI vector is one assigned to the device. 
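[Illustrative sketch, not part of this patch: a VFIO-style consumer would gate "safe interrupt" support on the new helper roughly as below; 'allow_unsafe_interrupts' stands in for whatever opt-out knob the caller provides and is not defined by this patch.]

	if (!msi_device_has_isolated_msi(dev) && !allow_unsafe_interrupts)
		return -EPERM;	/* device could trigger MSIs outside its own vectors */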
+ * + * This is interesting for securing VFIO use cases where a rouge MSI (eg created + * by abusing a normal PCI MemWr DMA) must not allow the VFIO userspace to + * impact outside its security domain, eg userspace triggering interrupts on + * kernel drivers, a VM triggering interrupts on the hypervisor, or a VM + * triggering interrupts on another VM. + */ +bool msi_device_has_isolated_msi(struct device *dev) +{ +	struct irq_domain *domain = dev_get_msi_domain(dev); + +	for (; domain; domain = domain->parent) +		if (domain->flags & IRQ_DOMAIN_FLAG_ISOLATED_MSI) +			return true; +	return arch_is_isolated_msi(); +} +EXPORT_SYMBOL_GPL(msi_device_has_isolated_msi); diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index f35d9cc1aab1..bfbc12da3326 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -157,14 +157,11 @@ static void test_kallsyms_compression_ratio(void)  static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr)  {  	u64 t0, t1, t; -	unsigned long flags;  	struct test_stat *stat = (struct test_stat *)data; -	local_irq_save(flags); -	t0 = sched_clock(); +	t0 = ktime_get_ns();  	(void)kallsyms_lookup_name(name); -	t1 = sched_clock(); -	local_irq_restore(flags); +	t1 = ktime_get_ns();  	t = t1 - t0;  	if (t < stat->min) @@ -234,18 +231,15 @@ static int find_symbol(void *data, const char *name, struct module *mod, unsigne  static void test_perf_kallsyms_on_each_symbol(void)  {  	u64 t0, t1; -	unsigned long flags;  	struct test_stat stat;  	memset(&stat, 0, sizeof(stat));  	stat.max = INT_MAX;  	stat.name = stub_name;  	stat.perf = 1; -	local_irq_save(flags); -	t0 = sched_clock(); +	t0 = ktime_get_ns();  	kallsyms_on_each_symbol(find_symbol, &stat); -	t1 = sched_clock(); -	local_irq_restore(flags); +	t1 = ktime_get_ns();  	pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);  } @@ -270,17 +264,14 @@ static int match_symbol(void *data, unsigned long addr)  static void test_perf_kallsyms_on_each_match_symbol(void)  {  	u64 t0, t1; -	unsigned long flags;  	struct test_stat stat;  	memset(&stat, 0, sizeof(stat));  	stat.max = INT_MAX;  	stat.name = stub_name; -	local_irq_save(flags); -	t0 = sched_clock(); +	t0 = ktime_get_ns();  	kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat); -	t1 = sched_clock(); -	local_irq_restore(flags); +	t1 = ktime_get_ns();  	pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);  } diff --git a/kernel/kcov.c b/kernel/kcov.c index e5cd09fd8a05..84c717337df0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -489,7 +489,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)  		goto exit;  	}  	spin_unlock_irqrestore(&kcov->lock, flags); -	vma->vm_flags |= VM_DONTEXPAND; +	vm_flags_set(vma, VM_DONTEXPAND);  	for (off = 0; off < size; off += PAGE_SIZE) {  		page = vmalloc_to_page(kcov->area + off);  		res = vm_insert_page(vma, vma->vm_start + off, page); diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index dcec1b743c69..a60c561724be 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -159,7 +159,7 @@ static bool __report_matches(const struct expect_report *r)  	const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;  	bool ret = false;  	unsigned long flags; -	typeof(observed.lines) expect; +	typeof(*observed.lines) *expect;  	const char *end;  	char *cur;  	int i; @@ -168,6 +168,10 @@ static bool __report_matches(const struct expect_report *r)  	if 
(!report_available())  		return false; +	expect = kmalloc(sizeof(observed.lines), GFP_KERNEL); +	if (WARN_ON(!expect)) +		return false; +  	/* Generate expected report contents. */  	/* Title */ @@ -253,6 +257,7 @@ static bool __report_matches(const struct expect_report *r)  		strstr(observed.lines[2], expect[1])));  out:  	spin_unlock_irqrestore(&observed.lock, flags); +	kfree(expect);  	return ret;  } diff --git a/kernel/kexec.c b/kernel/kexec.c index cb8e6e6f983c..92d301f98776 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -190,10 +190,12 @@ out_unlock:  static inline int kexec_load_check(unsigned long nr_segments,  				   unsigned long flags)  { +	int image_type = (flags & KEXEC_ON_CRASH) ? +			 KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;  	int result;  	/* We only trust the superuser with rebooting the system. */ -	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) +	if (!kexec_load_permitted(image_type))  		return -EPERM;  	/* Permit LSMs and IMA to fail the kexec */ diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 969e8f52f7da..3d578c6fefee 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -6,6 +6,7 @@  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/btf.h>  #include <linux/capability.h>  #include <linux/mm.h>  #include <linux/file.h> @@ -920,10 +921,64 @@ int kimage_load_segment(struct kimage *image,  	return result;  } +struct kexec_load_limit { +	/* Mutex protects the limit count. */ +	struct mutex mutex; +	int limit; +}; + +static struct kexec_load_limit load_limit_reboot = { +	.mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex), +	.limit = -1, +}; + +static struct kexec_load_limit load_limit_panic = { +	.mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex), +	.limit = -1, +}; +  struct kimage *kexec_image;  struct kimage *kexec_crash_image; -int kexec_load_disabled; +static int kexec_load_disabled; +  #ifdef CONFIG_SYSCTL +static int kexec_limit_handler(struct ctl_table *table, int write, +			       void *buffer, size_t *lenp, loff_t *ppos) +{ +	struct kexec_load_limit *limit = table->data; +	int val; +	struct ctl_table tmp = { +		.data = &val, +		.maxlen = sizeof(val), +		.mode = table->mode, +	}; +	int ret; + +	if (write) { +		ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); +		if (ret) +			return ret; + +		if (val < 0) +			return -EINVAL; + +		mutex_lock(&limit->mutex); +		if (limit->limit != -1 && val >= limit->limit) +			ret = -EINVAL; +		else +			limit->limit = val; +		mutex_unlock(&limit->mutex); + +		return ret; +	} + +	mutex_lock(&limit->mutex); +	val = limit->limit; +	mutex_unlock(&limit->mutex); + +	return proc_dointvec(&tmp, write, buffer, lenp, ppos); +} +  static struct ctl_table kexec_core_sysctls[] = {  	{  		.procname	= "kexec_load_disabled", @@ -935,6 +990,18 @@ static struct ctl_table kexec_core_sysctls[] = {  		.extra1		= SYSCTL_ONE,  		.extra2		= SYSCTL_ONE,  	}, +	{ +		.procname	= "kexec_load_limit_panic", +		.data		= &load_limit_panic, +		.mode		= 0644, +		.proc_handler	= kexec_limit_handler, +	}, +	{ +		.procname	= "kexec_load_limit_reboot", +		.data		= &load_limit_reboot, +		.mode		= 0644, +		.proc_handler	= kexec_limit_handler, +	},  	{ }  }; @@ -946,6 +1013,32 @@ static int __init kexec_core_sysctl_init(void)  late_initcall(kexec_core_sysctl_init);  #endif +bool kexec_load_permitted(int kexec_image_type) +{ +	struct kexec_load_limit *limit; + +	/* +	 * Only the superuser can use the kexec syscall and if it has not +	 * been disabled. 
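Usage illustration only (not part of the diff): the two kexec_load_limit_* entries accept a non-negative count, and the handler above only lets an already-set limit be lowered, never raised. Assuming the table is registered under the same /proc/sys/kernel directory as the long-standing kexec_load_disabled knob, a privileged process could restrict further reboot kexec loads to a single attempt roughly as sketched below; the file path and the chosen value are assumptions for the example.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path; the sysctl name comes from the table above. */
	const char *path = "/proc/sys/kernel/kexec_load_limit_reboot";
	const char *val = "1\n";
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Writes that do not lower an already-set limit are rejected. */
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}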
+	 */ +	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) +		return false; + +	/* Check limit counter and decrease it.*/ +	limit = (kexec_image_type == KEXEC_TYPE_CRASH) ? +		&load_limit_panic : &load_limit_reboot; +	mutex_lock(&limit->mutex); +	if (!limit->limit) { +		mutex_unlock(&limit->mutex); +		return false; +	} +	if (limit->limit != -1) +		limit->limit--; +	mutex_unlock(&limit->mutex); + +	return true; +} +  /*   * No panic_cpu check version of crash_kexec().  This function is called   * only when panic_cpu holds the current CPU number; this is the only CPU @@ -975,7 +1068,7 @@ void __noclone __crash_kexec(struct pt_regs *regs)  }  STACK_FRAME_NON_STANDARD(__crash_kexec); -void crash_kexec(struct pt_regs *regs) +__bpf_kfunc void crash_kexec(struct pt_regs *regs)  {  	int old_cpu, this_cpu; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index dd5983010b7b..f1a0e4e3fb5c 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -326,11 +326,13 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,  		unsigned long, cmdline_len, const char __user *, cmdline_ptr,  		unsigned long, flags)  { -	int ret = 0, i; +	int image_type = (flags & KEXEC_FILE_ON_CRASH) ? +			 KEXEC_TYPE_CRASH : KEXEC_TYPE_DEFAULT;  	struct kimage **dest_image, *image; +	int ret = 0, i;  	/* We only trust the superuser with rebooting the system. */ -	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) +	if (!kexec_load_permitted(image_type))  		return -EPERM;  	/* Make sure we have a legal set of flags */ @@ -342,11 +344,12 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,  	if (!kexec_trylock())  		return -EBUSY; -	dest_image = &kexec_image; -	if (flags & KEXEC_FILE_ON_CRASH) { +	if (image_type == KEXEC_TYPE_CRASH) {  		dest_image = &kexec_crash_image;  		if (kexec_crash_image)  			arch_kexec_unprotect_crashkres(); +	} else { +		dest_image = &kexec_image;  	}  	if (flags & KEXEC_FILE_UNLOAD) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1c18ecf9f98b..00e177de91cc 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -458,7 +458,7 @@ static inline int kprobe_optready(struct kprobe *p)  }  /* Return true if the kprobe is disarmed. 
Note: p must be on hash list */ -static inline bool kprobe_disarmed(struct kprobe *p) +bool kprobe_disarmed(struct kprobe *p)  {  	struct optimized_kprobe *op; @@ -555,17 +555,15 @@ static void do_unoptimize_kprobes(void)  	/* See comment in do_optimize_kprobes() */  	lockdep_assert_cpus_held(); -	/* Unoptimization must be done anytime */ -	if (list_empty(&unoptimizing_list)) -		return; +	if (!list_empty(&unoptimizing_list)) +		arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); -	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); -	/* Loop on 'freeing_list' for disarming */ +	/* Loop on 'freeing_list' for disarming and removing from kprobe hash list */  	list_for_each_entry_safe(op, tmp, &freeing_list, list) {  		/* Switching from detour code to origin */  		op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; -		/* Disarm probes if marked disabled */ -		if (kprobe_disabled(&op->kp)) +		/* Disarm probes if marked disabled and not gone */ +		if (kprobe_disabled(&op->kp) && !kprobe_gone(&op->kp))  			arch_disarm_kprobe(&op->kp);  		if (kprobe_unused(&op->kp)) {  			/* @@ -662,7 +660,7 @@ void wait_for_kprobe_optimizer(void)  	mutex_unlock(&kprobe_mutex);  } -static bool optprobe_queued_unopt(struct optimized_kprobe *op) +bool optprobe_queued_unopt(struct optimized_kprobe *op)  {  	struct optimized_kprobe *_op; @@ -797,14 +795,13 @@ static void kill_optimized_kprobe(struct kprobe *p)  	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;  	if (kprobe_unused(p)) { -		/* Enqueue if it is unused */ -		list_add(&op->list, &freeing_list);  		/* -		 * Remove unused probes from the hash list. After waiting -		 * for synchronization, this probe is reclaimed. -		 * (reclaiming is done by do_free_cleaned_kprobes().) +		 * Unused kprobe is on unoptimizing or freeing list. We move it +		 * to freeing_list and let the kprobe_optimizer() remove it from +		 * the kprobe hash list and free it.  		 */ -		hlist_del_rcu(&op->kp.hlist); +		if (optprobe_queued_unopt(op)) +			list_move(&op->list, &freeing_list);  	}  	/* Don't touch the code, because it is already freed. */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 2df00b789b90..0408aab80941 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -51,6 +51,14 @@ static ssize_t cpu_byteorder_show(struct kobject *kobj,  }  KERNEL_ATTR_RO(cpu_byteorder); +/* address bits */ +static ssize_t address_bits_show(struct kobject *kobj, +				 struct kobj_attribute *attr, char *buf) +{ +	return sysfs_emit(buf, "%zu\n", sizeof(void *) * 8 /* CHAR_BIT */); +} +KERNEL_ATTR_RO(address_bits); +  #ifdef CONFIG_UEVENT_HELPER  /* uevent helper program, used during early boot */  static ssize_t uevent_helper_show(struct kobject *kobj, @@ -233,6 +241,7 @@ static struct attribute * kernel_attrs[] = {  	&fscaps_attr.attr,  	&uevent_seqnum_attr.attr,  	&cpu_byteorder_attr.attr, +	&address_bits_attr.attr,  #ifdef CONFIG_UEVENT_HELPER  	&uevent_helper_attr.attr,  #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index f97fd01a2932..7e6751b29101 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1382,6 +1382,10 @@ EXPORT_SYMBOL_GPL(kthread_flush_worker);   * Flush and destroy @worker.  The simple flush is enough because the kthread   * worker API is used only in trivial scenarios.  There are no multi-step state   * machines needed. + * + * Note that this function is not responsible for handling delayed work, so the + * caller is responsible for queuing or canceling all delayed work items + * before invoking this function.   
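A minimal sketch (not from the diff) of the teardown ordering that the new comment and the added WARN_ON() expect: all delayed work is cancelled before kthread_destroy_worker() runs. The worker, work item and callback names (example_worker, example_dwork, example_fn) are made up, and error handling for kthread_create_worker() is omitted.

#include <linux/jiffies.h>
#include <linux/kthread.h>

static struct kthread_worker *example_worker;
static struct kthread_delayed_work example_dwork;

static void example_fn(struct kthread_work *work)
{
	/* periodic housekeeping would go here */
}

static int example_init(void)
{
	example_worker = kthread_create_worker(0, "example_worker");
	kthread_init_delayed_work(&example_dwork, example_fn);
	kthread_queue_delayed_work(example_worker, &example_dwork, HZ);
	return 0;
}

static void example_exit(void)
{
	/* Cancel delayed work first, then destroy the worker. */
	kthread_cancel_delayed_work_sync(&example_dwork);
	kthread_destroy_worker(example_worker);
}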
*/  void kthread_destroy_worker(struct kthread_worker *worker)  { @@ -1393,6 +1397,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)  	kthread_flush_worker(worker);  	kthread_stop(task); +	WARN_ON(!list_empty(&worker->delayed_work_list));  	WARN_ON(!list_empty(&worker->work_list));  	kfree(worker);  } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 201f0c0482fb..4bd2d5e10f20 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -118,7 +118,6 @@ static struct klp_object *klp_find_object(struct klp_patch *patch,  }  struct klp_find_arg { -	const char *objname;  	const char *name;  	unsigned long addr;  	unsigned long count; @@ -148,15 +147,9 @@ static int klp_find_callback(void *data, const char *name,  {  	struct klp_find_arg *args = data; -	if ((mod && !args->objname) || (!mod && args->objname)) -		return 0; -  	if (strcmp(args->name, name))  		return 0; -	if (args->objname && strcmp(args->objname, mod->name)) -		return 0; -  	return klp_match_callback(data, addr);  } @@ -164,7 +157,6 @@ static int klp_find_object_symbol(const char *objname, const char *name,  				  unsigned long sympos, unsigned long *addr)  {  	struct klp_find_arg args = { -		.objname = objname,  		.name = name,  		.addr = 0,  		.count = 0, @@ -172,7 +164,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,  	};  	if (objname) -		module_kallsyms_on_each_symbol(klp_find_callback, &args); +		module_kallsyms_on_each_symbol(objname, klp_find_callback, &args);  	else  		kallsyms_on_each_match_symbol(klp_match_callback, name, &args); @@ -268,6 +260,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,  	return 0;  } +void __weak clear_relocate_add(Elf_Shdr *sechdrs, +		   const char *strtab, +		   unsigned int symindex, +		   unsigned int relsec, +		   struct module *me) +{ +} +  /*   * At a high-level, there are two types of klp relocation sections: those which   * reference symbols which live in vmlinux; and those which reference symbols @@ -291,10 +291,10 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,   *    the to-be-patched module to be loaded and patched sometime *after* the   *    klp module is loaded.   */ -int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, -			     const char *shstrtab, const char *strtab, -			     unsigned int symndx, unsigned int secndx, -			     const char *objname) +static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, +				    const char *shstrtab, const char *strtab, +				    unsigned int symndx, unsigned int secndx, +				    const char *objname, bool apply)  {  	int cnt, ret;  	char sec_objname[MODULE_NAME_LEN]; @@ -316,11 +316,26 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,  	if (strcmp(objname ? 
objname : "vmlinux", sec_objname))  		return 0; -	ret = klp_resolve_symbols(sechdrs, strtab, symndx, sec, sec_objname); -	if (ret) -		return ret; +	if (apply) { +		ret = klp_resolve_symbols(sechdrs, strtab, symndx, +					  sec, sec_objname); +		if (ret) +			return ret; + +		return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); +	} + +	clear_relocate_add(sechdrs, strtab, symndx, secndx, pmod); +	return 0; +} -	return apply_relocate_add(sechdrs, strtab, symndx, secndx, pmod); +int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, +			     const char *shstrtab, const char *strtab, +			     unsigned int symndx, unsigned int secndx, +			     const char *objname) +{ +	return klp_write_section_relocs(pmod, sechdrs, shstrtab, strtab, symndx, +					secndx, objname, true);  }  /* @@ -769,8 +784,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)  			   func->old_sympos ? func->old_sympos : 1);  } -static int klp_apply_object_relocs(struct klp_patch *patch, -				   struct klp_object *obj) +static int klp_write_object_relocs(struct klp_patch *patch, +				   struct klp_object *obj, +				   bool apply)  {  	int i, ret;  	struct klp_modinfo *info = patch->mod->klp_info; @@ -781,10 +797,10 @@ static int klp_apply_object_relocs(struct klp_patch *patch,  		if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))  			continue; -		ret = klp_apply_section_relocs(patch->mod, info->sechdrs, +		ret = klp_write_section_relocs(patch->mod, info->sechdrs,  					       info->secstrings,  					       patch->mod->core_kallsyms.strtab, -					       info->symndx, i, obj->name); +					       info->symndx, i, obj->name, apply);  		if (ret)  			return ret;  	} @@ -792,6 +808,18 @@ static int klp_apply_object_relocs(struct klp_patch *patch,  	return 0;  } +static int klp_apply_object_relocs(struct klp_patch *patch, +				   struct klp_object *obj) +{ +	return klp_write_object_relocs(patch, obj, true); +} + +static void klp_clear_object_relocs(struct klp_patch *patch, +				    struct klp_object *obj) +{ +	klp_write_object_relocs(patch, obj, false); +} +  /* parts of the initialization that is done only when the object is loaded */  static int klp_init_object_loaded(struct klp_patch *patch,  				  struct klp_object *obj) @@ -1179,7 +1207,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod,  			klp_unpatch_object(obj);  			klp_post_unpatch_callback(obj); - +			klp_clear_object_relocs(patch, obj);  			klp_free_object_loaded(obj);  			break;  		} diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e3375bc40dad..50d4863974e7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@  #include <linux/rcupdate.h>  #include <linux/kprobes.h>  #include <linux/lockdep.h> +#include <linux/context_tracking.h>  #include <asm/sections.h> @@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)  {  	struct task_struct *curr = current;  	int dl = READ_ONCE(debug_locks); +	bool rcu = warn_rcu_enter();  	/* Note: the following can be executed concurrently, so be careful. 
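The warn_rcu_enter()/warn_rcu_exit() pair added here (and, further down, to the WARN paths in kernel/panic.c) follows a simple bracketing pattern. Below is a hedged sketch of a similarly instrumented reporting helper; the function name and message are invented for illustration.

#include <linux/context_tracking.h>
#include <linux/printk.h>

static void example_report(const char *what)
{
	/* Make sure RCU is watching this CPU while the report is printed. */
	bool rcu = warn_rcu_enter();

	pr_warn("example: suspicious condition: %s\n", what);
	dump_stack();

	/* Restore the previous context-tracking state. */
	warn_rcu_exit(rcu);
}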
*/  	pr_warn("\n"); @@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)  	lockdep_print_held_locks(curr);  	pr_warn("\nstack backtrace:\n");  	dump_stack(); +	warn_rcu_exit(rcu);  }  EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9c2fb613a55d..f04b1978899d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -46,6 +46,9 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");  torture_param(int, stat_interval, 60,  	     "Number of seconds between stats printk()s");  torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(int, rt_boost, 2, +		"Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); +torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");  torture_param(int, verbose, 1,  	     "Enable verbose debugging printk()s"); @@ -127,15 +130,50 @@ static void torture_lock_busted_write_unlock(int tid __maybe_unused)  	  /* BUGGY, do not use in real life!!! */  } -static void torture_boost_dummy(struct torture_random_state *trsp) +static void __torture_rt_boost(struct torture_random_state *trsp)  { -	/* Only rtmutexes care about priority */ +	const unsigned int factor = rt_boost_factor; + +	if (!rt_task(current)) { +		/* +		 * Boost priority once every rt_boost_factor operations. When +		 * the task tries to take the lock, the rtmutex it will account +		 * for the new priority, and do any corresponding pi-dance. +		 */ +		if (trsp && !(torture_random(trsp) % +			      (cxt.nrealwriters_stress * factor))) { +			sched_set_fifo(current); +		} else /* common case, do nothing */ +			return; +	} else { +		/* +		 * The task will remain boosted for another 10 * rt_boost_factor +		 * operations, then restored back to its original prio, and so +		 * forth. +		 * +		 * When @trsp is nil, we want to force-reset the task for +		 * stopping the kthread. 
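Back-of-the-envelope illustration (not part of the test itself): with the logic above, a non-RT writer is boosted on average once every cxt.nrealwriters_stress * rt_boost_factor of its write operations, and an already-boosted writer is restored to normal priority half as often. The helper names and the example numbers (4 writers, the default factor of 50) are assumptions.

/* Average write operations between priority changes for one writer. */
static unsigned int avg_ops_per_boost(unsigned int nwriters, unsigned int factor)
{
	return nwriters * factor;		/* e.g. 4 * 50     = 200 ops */
}

static unsigned int avg_ops_per_deboost(unsigned int nwriters, unsigned int factor)
{
	return nwriters * factor * 2;		/* e.g. 4 * 50 * 2 = 400 ops */
}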
+		 */ +		if (!trsp || !(torture_random(trsp) % +			       (cxt.nrealwriters_stress * factor * 2))) { +			sched_set_normal(current, 0); +		} else /* common case, do nothing */ +			return; +	} +} + +static void torture_rt_boost(struct torture_random_state *trsp) +{ +	if (rt_boost != 2) +		return; + +	__torture_rt_boost(trsp);  }  static struct lock_torture_ops lock_busted_ops = {  	.writelock	= torture_lock_busted_write_lock,  	.write_delay	= torture_lock_busted_write_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_lock_busted_write_unlock,  	.readlock       = NULL,  	.read_delay     = NULL, @@ -179,7 +217,7 @@ __releases(torture_spinlock)  static struct lock_torture_ops spin_lock_ops = {  	.writelock	= torture_spin_lock_write_lock,  	.write_delay	= torture_spin_lock_write_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_spin_lock_write_unlock,  	.readlock       = NULL,  	.read_delay     = NULL, @@ -206,7 +244,7 @@ __releases(torture_spinlock)  static struct lock_torture_ops spin_lock_irq_ops = {  	.writelock	= torture_spin_lock_write_lock_irq,  	.write_delay	= torture_spin_lock_write_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_lock_spin_write_unlock_irq,  	.readlock       = NULL,  	.read_delay     = NULL, @@ -275,7 +313,7 @@ __releases(torture_rwlock)  static struct lock_torture_ops rw_lock_ops = {  	.writelock	= torture_rwlock_write_lock,  	.write_delay	= torture_rwlock_write_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_rwlock_write_unlock,  	.readlock       = torture_rwlock_read_lock,  	.read_delay     = torture_rwlock_read_delay, @@ -318,7 +356,7 @@ __releases(torture_rwlock)  static struct lock_torture_ops rw_lock_irq_ops = {  	.writelock	= torture_rwlock_write_lock_irq,  	.write_delay	= torture_rwlock_write_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_rwlock_write_unlock_irq,  	.readlock       = torture_rwlock_read_lock_irq,  	.read_delay     = torture_rwlock_read_delay, @@ -358,7 +396,7 @@ __releases(torture_mutex)  static struct lock_torture_ops mutex_lock_ops = {  	.writelock	= torture_mutex_lock,  	.write_delay	= torture_mutex_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_mutex_unlock,  	.readlock       = NULL,  	.read_delay     = NULL, @@ -456,7 +494,7 @@ static struct lock_torture_ops ww_mutex_lock_ops = {  	.exit		= torture_ww_mutex_exit,  	.writelock	= torture_ww_mutex_lock,  	.write_delay	= torture_mutex_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_ww_mutex_unlock,  	.readlock       = NULL,  	.read_delay     = NULL, @@ -474,37 +512,6 @@ __acquires(torture_rtmutex)  	return 0;  } -static void torture_rtmutex_boost(struct torture_random_state *trsp) -{ -	const unsigned int factor = 50000; /* yes, quite arbitrary */ - -	if (!rt_task(current)) { -		/* -		 * Boost priority once every ~50k operations. When the -		 * task tries to take the lock, the rtmutex it will account -		 * for the new priority, and do any corresponding pi-dance. 
-		 */ -		if (trsp && !(torture_random(trsp) % -			      (cxt.nrealwriters_stress * factor))) { -			sched_set_fifo(current); -		} else /* common case, do nothing */ -			return; -	} else { -		/* -		 * The task will remain boosted for another ~500k operations, -		 * then restored back to its original prio, and so forth. -		 * -		 * When @trsp is nil, we want to force-reset the task for -		 * stopping the kthread. -		 */ -		if (!trsp || !(torture_random(trsp) % -			       (cxt.nrealwriters_stress * factor * 2))) { -			sched_set_normal(current, 0); -		} else /* common case, do nothing */ -			return; -	} -} -  static void torture_rtmutex_delay(struct torture_random_state *trsp)  {  	const unsigned long shortdelay_us = 2; @@ -530,10 +537,18 @@ __releases(torture_rtmutex)  	rt_mutex_unlock(&torture_rtmutex);  } +static void torture_rt_boost_rtmutex(struct torture_random_state *trsp) +{ +	if (!rt_boost) +		return; + +	__torture_rt_boost(trsp); +} +  static struct lock_torture_ops rtmutex_lock_ops = {  	.writelock	= torture_rtmutex_lock,  	.write_delay	= torture_rtmutex_delay, -	.task_boost     = torture_rtmutex_boost, +	.task_boost     = torture_rt_boost_rtmutex,  	.writeunlock	= torture_rtmutex_unlock,  	.readlock       = NULL,  	.read_delay     = NULL, @@ -600,7 +615,7 @@ __releases(torture_rwsem)  static struct lock_torture_ops rwsem_lock_ops = {  	.writelock	= torture_rwsem_down_write,  	.write_delay	= torture_rwsem_write_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_rwsem_up_write,  	.readlock       = torture_rwsem_down_read,  	.read_delay     = torture_rwsem_read_delay, @@ -652,7 +667,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {  	.exit		= torture_percpu_rwsem_exit,  	.writelock	= torture_percpu_rwsem_down_write,  	.write_delay	= torture_rwsem_write_delay, -	.task_boost     = torture_boost_dummy, +	.task_boost     = torture_rt_boost,  	.writeunlock	= torture_percpu_rwsem_up_write,  	.readlock       = torture_percpu_rwsem_down_read,  	.read_delay     = torture_rwsem_read_delay, diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2b23378775fe..ebe6b8ec7cb3 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -371,7 +371,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	/*  	 * We're pending, wait for the owner to go away.  	 * -	 * 0,1,1 -> 0,1,0 +	 * 0,1,1 -> *,1,0  	 *  	 * this wait loop must be a load-acquire such that we match the  	 * store-release that clears the locked bit and create lock @@ -380,7 +380,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	 * barriers.  	 */  	if (val & _Q_LOCKED_MASK) -		atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK)); +		smp_cond_load_acquire(&lock->locked, !VAL);  	/*  	 * take ownership and clear the pending bit. diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 010cf4e6d0b8..728f434de2bb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -901,8 +901,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,  		 * then we need to wake the new top waiter up to try  		 * to get the lock.  		 
*/ -		if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) -			wake_up_state(waiter->task, waiter->wake_state); +		top_waiter = rt_mutex_top_waiter(lock); +		if (prerequeue_top_waiter != top_waiter) +			wake_up_state(top_waiter->task, top_waiter->wake_state);  		raw_spin_unlock_irq(&lock->wait_lock);  		return 0;  	} diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44873594de03..acb5a50309a1 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -256,16 +256,13 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)  static inline bool rwsem_write_trylock(struct rw_semaphore *sem)  {  	long tmp = RWSEM_UNLOCKED_VALUE; -	bool ret = false; -	preempt_disable();  	if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {  		rwsem_set_owner(sem); -		ret = true; +		return true;  	} -	preempt_enable(); -	return ret; +	return false;  }  /* @@ -624,18 +621,16 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,  			 */  			if (first->handoff_set && (waiter != first))  				return false; - -			/* -			 * First waiter can inherit a previously set handoff -			 * bit and spin on rwsem if lock acquisition fails. -			 */ -			if (waiter == first) -				waiter->handoff_set = true;  		}  		new = count;  		if (count & RWSEM_LOCK_MASK) { +			/* +			 * A waiter (first or not) can set the handoff bit +			 * if it is an RT task or wait in the wait queue +			 * for too long. +			 */  			if (has_handoff || (!rt_task(waiter->task) &&  					    !time_after(jiffies, waiter->timeout)))  				return false; @@ -651,11 +646,12 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,  	} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));  	/* -	 * We have either acquired the lock with handoff bit cleared or -	 * set the handoff bit. +	 * We have either acquired the lock with handoff bit cleared or set +	 * the handoff bit. Only the first waiter can have its handoff_set +	 * set here to enable optimistic spinning in slowpath loop.  	 */  	if (new & RWSEM_FLAG_HANDOFF) { -		waiter->handoff_set = true; +		first->handoff_set = true;  		lockevent_inc(rwsem_wlock_handoff);  		return false;  	} @@ -717,7 +713,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)  		return false;  	} -	preempt_disable();  	/*  	 * Disable preemption is equal to the RCU read-side crital section,  	 * thus the task_strcut structure won't go away. @@ -729,7 +724,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)  	if ((flags & RWSEM_NONSPINNABLE) ||  	    (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))  		ret = false; -	preempt_enable();  	lockevent_cond_inc(rwsem_opt_fail, !ret);  	return ret; @@ -829,8 +823,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  	int loop = 0;  	u64 rspin_threshold = 0; -	preempt_disable(); -  	/* sem->wait_lock should not be held when doing optimistic spinning */  	if (!osq_lock(&sem->osq))  		goto done; @@ -938,7 +930,6 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  	}  	osq_unlock(&sem->osq);  done: -	preempt_enable();  	lockevent_cond_inc(rwsem_opt_fail, !taken);  	return taken;  } @@ -1092,7 +1083,7 @@ queue:  			/* Ordered by sem->wait_lock against rwsem_mark_wake(). 
*/  			break;  		} -		schedule(); +		schedule_preempt_disabled();  		lockevent_inc(rwsem_sleep_reader);  	} @@ -1179,15 +1170,12 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)  		if (waiter.handoff_set) {  			enum owner_state owner_state; -			preempt_disable();  			owner_state = rwsem_spin_on_owner(sem); -			preempt_enable(); -  			if (owner_state == OWNER_NULL)  				goto trylock_again;  		} -		schedule(); +		schedule_preempt_disabled();  		lockevent_inc(rwsem_sleep_writer);  		set_current_state(state);  trylock_again: @@ -1254,14 +1242,20 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)   */  static inline int __down_read_common(struct rw_semaphore *sem, int state)  { +	int ret = 0;  	long count; +	preempt_disable();  	if (!rwsem_read_trylock(sem, &count)) { -		if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) -			return -EINTR; +		if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) { +			ret = -EINTR; +			goto out; +		}  		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);  	} -	return 0; +out: +	preempt_enable(); +	return ret;  }  static inline void __down_read(struct rw_semaphore *sem) @@ -1281,19 +1275,23 @@ static inline int __down_read_killable(struct rw_semaphore *sem)  static inline int __down_read_trylock(struct rw_semaphore *sem)  { +	int ret = 0;  	long tmp;  	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); +	preempt_disable();  	tmp = atomic_long_read(&sem->count);  	while (!(tmp & RWSEM_READ_FAILED_MASK)) {  		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,  						    tmp + RWSEM_READER_BIAS)) {  			rwsem_set_reader_owned(sem); -			return 1; +			ret = 1; +			break;  		}  	} -	return 0; +	preempt_enable(); +	return ret;  }  /* @@ -1301,12 +1299,15 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)   */  static inline int __down_write_common(struct rw_semaphore *sem, int state)  { +	int ret = 0; + +	preempt_disable();  	if (unlikely(!rwsem_write_trylock(sem))) {  		if (IS_ERR(rwsem_down_write_slowpath(sem, state))) -			return -EINTR; +			ret = -EINTR;  	} - -	return 0; +	preempt_enable(); +	return ret;  }  static inline void __down_write(struct rw_semaphore *sem) @@ -1321,8 +1322,14 @@ static inline int __down_write_killable(struct rw_semaphore *sem)  static inline int __down_write_trylock(struct rw_semaphore *sem)  { +	int ret; + +	preempt_disable();  	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); -	return rwsem_write_trylock(sem); +	ret = rwsem_write_trylock(sem); +	preempt_enable(); + +	return ret;  }  /* @@ -1335,6 +1342,7 @@ static inline void __up_read(struct rw_semaphore *sem)  	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);  	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); +	preempt_disable();  	rwsem_clear_reader_owned(sem);  	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);  	DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); @@ -1343,6 +1351,7 @@ static inline void __up_read(struct rw_semaphore *sem)  		clear_nonspinnable(sem);  		rwsem_wake(sem);  	} +	preempt_enable();  }  /* @@ -1363,9 +1372,9 @@ static inline void __up_write(struct rw_semaphore *sem)  	preempt_disable();  	rwsem_clear_owner(sem);  	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); -	preempt_enable();  	if (unlikely(tmp & RWSEM_FLAG_WAITERS))  		rwsem_wake(sem); +	preempt_enable();  }  /* @@ -1383,11 +1392,13 @@ static inline void __downgrade_write(struct rw_semaphore *sem)  	 * write side. As such, rely on RELEASE semantics.  	 
*/  	DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); +	preempt_disable();  	tmp = atomic_long_fetch_add_release(  		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);  	rwsem_set_reader_owned(sem);  	if (tmp & RWSEM_FLAG_WAITERS)  		rwsem_downgrade_wake(sem); +	preempt_enable();  }  #else /* !CONFIG_PREEMPT_RT */ @@ -1662,6 +1673,12 @@ void down_read_non_owner(struct rw_semaphore *sem)  {  	might_sleep();  	__down_read(sem); +	/* +	 * The owner value for a reader-owned lock is mostly for debugging +	 * purpose only and is not critical to the correct functioning of +	 * rwsem. So it is perfectly fine to set it in a preempt-enabled +	 * context here. +	 */  	__rwsem_set_reader_owned(sem, NULL);  }  EXPORT_SYMBOL(down_read_non_owner); diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 4523f99b0358..ab2376a1be88 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -494,7 +494,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)  	return ret;  } -int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, +int module_kallsyms_on_each_symbol(const char *modname, +				   int (*fn)(void *, const char *,  					     struct module *, unsigned long),  				   void *data)  { @@ -509,6 +510,9 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,  		if (mod->state == MODULE_STATE_UNFORMED)  			continue; +		if (modname && strcmp(modname, mod->name)) +			continue; +  		/* Use rcu_dereference_sched() to remain compliant with the sparse tool */  		preempt_disable();  		kallsyms = rcu_dereference_sched(mod->kallsyms); @@ -525,6 +529,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,  			if (ret != 0)  				goto out;  		} + +		/* +		 * The given module is found, the subsequent modules do not +		 * need to be compared. +		 */ +		if (modname) +			break;  	}  out:  	mutex_unlock(&module_mutex); diff --git a/kernel/module/main.c b/kernel/module/main.c index 48568a0f5651..d3be89de706d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -17,6 +17,7 @@  #include <linux/fs.h>  #include <linux/kernel.h>  #include <linux/kernel_read_file.h> +#include <linux/kstrtox.h>  #include <linux/slab.h>  #include <linux/vmalloc.h>  #include <linux/elf.h> @@ -2393,7 +2394,8 @@ static bool finished_loading(const char *name)  	sched_annotate_sleep();  	mutex_lock(&module_mutex);  	mod = find_module_all(name, strlen(name), true); -	ret = !mod || mod->state == MODULE_STATE_LIVE; +	ret = !mod || mod->state == MODULE_STATE_LIVE +		|| mod->state == MODULE_STATE_GOING;  	mutex_unlock(&module_mutex);  	return ret; @@ -2569,20 +2571,35 @@ static int add_unformed_module(struct module *mod)  	mod->state = MODULE_STATE_UNFORMED; -again:  	mutex_lock(&module_mutex);  	old = find_module_all(mod->name, strlen(mod->name), true);  	if (old != NULL) { -		if (old->state != MODULE_STATE_LIVE) { +		if (old->state == MODULE_STATE_COMING +		    || old->state == MODULE_STATE_UNFORMED) {  			/* Wait in case it fails to load. */  			mutex_unlock(&module_mutex);  			err = wait_event_interruptible(module_wq,  					       finished_loading(mod->name));  			if (err)  				goto out_unlocked; -			goto again; + +			/* The module might have gone in the meantime. */ +			mutex_lock(&module_mutex); +			old = find_module_all(mod->name, strlen(mod->name), +					      true);  		} -		err = -EEXIST; + +		/* +		 * We are here only when the same module was being loaded. Do +		 * not try to load it again right now. 
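For illustration (not mandated by the diff): with the add_unformed_module() change here, a same-name module that is already live is still reported as -EEXIST, while a parallel load that is in flight or has failed surfaces as -EBUSY, and both reach userspace through finit_module(). A hypothetical sketch of how a loader might tell the two cases apart; the helper name and the "do not retry now" policy are assumptions.

#include <errno.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Returns 0 if the module ends up loaded, -1 otherwise. */
static int example_load_module(int fd, const char *params)
{
	if (syscall(SYS_finit_module, fd, params, 0) == 0)
		return 0;
	if (errno == EEXIST)
		return 0;	/* same module already live */
	if (errno == EBUSY)
		fprintf(stderr, "parallel load in flight or failed; not retrying now\n");
	else
		perror("finit_module");
	return -1;
}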
It prevents long delays +		 * caused by serialized module load failures. It might happen +		 * when more devices of the same type trigger load of +		 * a particular module. +		 */ +		if (old && old->state == MODULE_STATE_LIVE) +			err = -EEXIST; +		else +			err = -EBUSY;  		goto out;  	}  	mod_update_bounds(mod); @@ -2659,7 +2676,7 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,  	int ret;  	if (strcmp(param, "async_probe") == 0) { -		if (strtobool(val, &mod->async_probe_requested)) +		if (kstrtobool(val, &mod->async_probe_requested))  			mod->async_probe_requested = true;  		return 0;  	} diff --git a/kernel/notifier.c b/kernel/notifier.c index ab75637fd904..d353e4b5402d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -456,7 +456,6 @@ int raw_notifier_call_chain(struct raw_notifier_head *nh,  }  EXPORT_SYMBOL_GPL(raw_notifier_call_chain); -#ifdef CONFIG_SRCU  /*   *	SRCU notifier chain routines.    Registration and unregistration   *	use a mutex, and call_chain is synchronized by SRCU (no locks). @@ -573,8 +572,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh)  }  EXPORT_SYMBOL_GPL(srcu_init_notifier_head); -#endif /* CONFIG_SRCU */ -  static ATOMIC_NOTIFIER_HEAD(die_chain);  int notrace notify_die(enum die_val val, const char *str, diff --git a/kernel/panic.c b/kernel/panic.c index 463c9295bc28..5cfea8302d23 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@  #include <linux/ratelimit.h>  #include <linux/debugfs.h>  #include <linux/sysfs.h> +#include <linux/context_tracking.h>  #include <trace/events/error_report.h>  #include <asm/sections.h> @@ -211,9 +212,6 @@ static void panic_print_sys_info(bool console_flush)  		return;  	} -	if (panic_print & PANIC_PRINT_ALL_CPU_BT) -		trigger_all_cpu_backtrace(); -  	if (panic_print & PANIC_PRINT_TASK_INFO)  		show_state(); @@ -243,6 +241,30 @@ void check_panic_on_warn(const char *origin)  		      origin, limit);  } +/* + * Helper that triggers the NMI backtrace (if set in panic_print) + * and then performs the secondary CPUs shutdown - we cannot have + * the NMI backtrace after the CPUs are off! + */ +static void panic_other_cpus_shutdown(bool crash_kexec) +{ +	if (panic_print & PANIC_PRINT_ALL_CPU_BT) +		trigger_all_cpu_backtrace(); + +	/* +	 * Note that smp_send_stop() is the usual SMP shutdown function, +	 * which unfortunately may not be hardened to work in a panic +	 * situation. If we want to do crash dump after notifier calls +	 * and kmsg_dump, we will need architecture dependent extra +	 * bits in addition to stopping other CPUs, hence we rely on +	 * crash_smp_send_stop() for that. +	 */ +	if (!crash_kexec) +		smp_send_stop(); +	else +		crash_smp_send_stop(); +} +  /**   *	panic - halt the system   *	@fmt: The text string to print @@ -333,23 +355,10 @@ void panic(const char *fmt, ...)  	 *  	 * Bypass the panic_cpu check and call __crash_kexec directly.  	 */ -	if (!_crash_kexec_post_notifiers) { +	if (!_crash_kexec_post_notifiers)  		__crash_kexec(NULL); -		/* -		 * Note smp_send_stop is the usual smp shutdown function, which -		 * unfortunately means it may not be hardened to work in a -		 * panic situation. -		 */ -		smp_send_stop(); -	} else { -		/* -		 * If we want to do crash dump after notifier calls and -		 * kmsg_dump, we will need architecture dependent extra -		 * works in addition to stopping other CPUs. 
-		 */ -		crash_smp_send_stop(); -	} +	panic_other_cpus_shutdown(_crash_kexec_post_notifiers);  	/*  	 * Run any panic handlers, including those that might need to @@ -679,6 +688,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,  void warn_slowpath_fmt(const char *file, int line, unsigned taint,  		       const char *fmt, ...)  { +	bool rcu = warn_rcu_enter();  	struct warn_args args;  	pr_warn(CUT_HERE); @@ -693,11 +703,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint,  	va_start(args.args, fmt);  	__warn(file, line, __builtin_return_address(0), taint, NULL, &args);  	va_end(args.args); +	warn_rcu_exit(rcu);  }  EXPORT_SYMBOL(warn_slowpath_fmt);  #else  void __warn_printk(const char *fmt, ...)  { +	bool rcu = warn_rcu_enter();  	va_list args;  	pr_warn(CUT_HERE); @@ -705,6 +717,7 @@ void __warn_printk(const char *fmt, ...)  	va_start(args, fmt);  	vprintk(fmt, args);  	va_end(args); +	warn_rcu_exit(rcu);  }  EXPORT_SYMBOL(__warn_printk);  #endif diff --git a/kernel/params.c b/kernel/params.c index 14d66070757b..6e34ca89ebae 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -4,6 +4,7 @@  */  #include <linux/kernel.h> +#include <linux/kstrtox.h>  #include <linux/string.h>  #include <linux/errno.h>  #include <linux/module.h> @@ -310,7 +311,7 @@ int param_set_bool(const char *val, const struct kernel_param *kp)  	if (!val) val = "1";  	/* One of =[yYnN01] */ -	return strtobool(val, kp->arg); +	return kstrtobool(val, kp->arg);  }  EXPORT_SYMBOL(param_set_bool); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f4f8cb0435b4..46e0d5a3f91f 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,7 @@  #include <linux/sched/task.h>  #include <linux/sched/signal.h>  #include <linux/idr.h> +#include "pid_sysctl.h"  static DEFINE_MUTEX(pid_caches_mutex);  static struct kmem_cache *pid_ns_cachep; @@ -110,6 +111,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns  	ns->ucounts = ucounts;  	ns->pid_allocated = PIDNS_ADDING; +	initialize_memfd_noexec_scope(ns); +  	return ns;  out_free_idr: @@ -244,7 +247,24 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)  		set_current_state(TASK_INTERRUPTIBLE);  		if (pid_ns->pid_allocated == init_pids)  			break; +		/* +		 * Release tasks_rcu_exit_srcu to avoid following deadlock: +		 * +		 * 1) TASK A unshare(CLONE_NEWPID) +		 * 2) TASK A fork() twice -> TASK B (child reaper for new ns) +		 *    and TASK C +		 * 3) TASK B exits, kills TASK C, waits for TASK A to reap it +		 * 4) TASK A calls synchronize_rcu_tasks() +		 *                   -> synchronize_srcu(tasks_rcu_exit_srcu) +		 * 5) *DEADLOCK* +		 * +		 * It is considered safe to release tasks_rcu_exit_srcu here +		 * because we assume the current task can not be concurrently +		 * reaped at this point. 
+		 */ +		exit_tasks_rcu_stop();  		schedule(); +		exit_tasks_rcu_start();  	}  	__set_current_state(TASK_RUNNING); @@ -455,6 +475,8 @@ static __init int pid_namespaces_init(void)  #ifdef CONFIG_CHECKPOINT_RESTORE  	register_sysctl_paths(kern_path, pid_ns_ctl_table);  #endif + +	register_pid_ns_sysctl_table_vm();  	return 0;  } diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h new file mode 100644 index 000000000000..e22d072e1e24 --- /dev/null +++ b/kernel/pid_sysctl.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_PID_SYSCTL_H +#define LINUX_PID_SYSCTL_H + +#include <linux/pid_namespace.h> + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) +{ +	ns->memfd_noexec_scope = +		task_active_pid_ns(current)->memfd_noexec_scope; +} + +static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, +	int write, void *buf, size_t *lenp, loff_t *ppos) +{ +	struct pid_namespace *ns = task_active_pid_ns(current); +	struct ctl_table table_copy; + +	if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) +		return -EPERM; + +	table_copy = *table; +	if (ns != &init_pid_ns) +		table_copy.data = &ns->memfd_noexec_scope; + +	/* +	 * set minimum to current value, the effect is only bigger +	 * value is accepted. +	 */ +	if (*(int *)table_copy.data > *(int *)table_copy.extra1) +		table_copy.extra1 = table_copy.data; + +	return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); +} + +static struct ctl_table pid_ns_ctl_table_vm[] = { +	{ +		.procname	= "memfd_noexec", +		.data		= &init_pid_ns.memfd_noexec_scope, +		.maxlen		= sizeof(init_pid_ns.memfd_noexec_scope), +		.mode		= 0644, +		.proc_handler	= pid_mfd_noexec_dointvec_minmax, +		.extra1		= SYSCTL_ZERO, +		.extra2		= SYSCTL_TWO, +	}, +	{ } +}; +static struct ctl_path vm_path[] = { { .procname = "vm", }, { } }; +static inline void register_pid_ns_sysctl_table_vm(void) +{ +	register_sysctl_paths(vm_path, pid_ns_ctl_table_vm); +} +#else +static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void set_memfd_noexec_scope(struct pid_namespace *ns) {} +static inline void register_pid_ns_sysctl_table_vm(void) {} +#endif + +#endif /* LINUX_PID_SYSCTL_H */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 60a1d3051cc7..4b31629c5be4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -118,7 +118,6 @@ config PM_SLEEP  	def_bool y  	depends on SUSPEND || HIBERNATE_CALLBACKS  	select PM -	select SRCU  config PM_SLEEP_SMP  	def_bool y diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index f82111837b8d..7b44f5b89fa1 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -87,10 +87,7 @@ static void em_debug_create_pd(struct device *dev)  static void em_debug_remove_pd(struct device *dev)  { -	struct dentry *debug_dir; - -	debug_dir = debugfs_lookup(dev_name(dev), rootdir); -	debugfs_remove_recursive(debug_dir); +	debugfs_lookup_and_remove(dev_name(dev), rootdir);  }  static int __init em_debug_init(void) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 277434b6c0bf..36a1df48280c 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -581,7 +581,7 @@ static int save_image(struct swap_map_handle *handle,  	return ret;  } -/** +/*   * Structure used for CRC32.   
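Usage illustration only: given the "vm" path and "memfd_noexec" procname registered above, the knob should appear as /proc/sys/vm/memfd_noexec, takes values 0..2, and can only be raised (the handler bumps the minimum to the current value and demands CAP_SYS_ADMIN in the owning user namespace). The exact meaning of each value is defined by the memfd code and not shown here; the path and the chosen value below are assumptions for the example.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumed path, derived from the "vm" + "memfd_noexec" registration. */
	int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Tighten the scope for this pid namespace; lowering it is rejected. */
	if (write(fd, "2\n", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}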
*/  struct crc_data { @@ -596,7 +596,7 @@ struct crc_data {  	unsigned char *unc[LZO_THREADS];          /* uncompressed data */  }; -/** +/*   * CRC32 update function that runs in its own thread.   */  static int crc32_threadfn(void *data) @@ -623,7 +623,7 @@ static int crc32_threadfn(void *data)  	}  	return 0;  } -/** +/*   * Structure used for LZO data compression.   */  struct cmp_data { @@ -640,7 +640,7 @@ struct cmp_data {  	unsigned char wrk[LZO1X_1_MEM_COMPRESS];  /* compression workspace */  }; -/** +/*   * Compression function that runs in its own thread.   */  static int lzo_compress_threadfn(void *data) @@ -948,9 +948,9 @@ out_finish:  	return error;  } -/** +/*   *	The following functions allow us to read data using a swap map - *	in a file-alike way + *	in a file-like way.   */  static void release_swap_reader(struct swap_map_handle *handle) @@ -1107,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle,  	return ret;  } -/** +/*   * Structure used for LZO data decompression.   */  struct dec_data { @@ -1123,7 +1123,7 @@ struct dec_data {  	unsigned char cmp[LZO_CMP_SIZE];          /* compressed buffer */  }; -/** +/*   * Decompression function that runs in its own thread.   */  static int lzo_decompress_threadfn(void *data) diff --git a/kernel/printk/index.c b/kernel/printk/index.c index c85be186a783..a6b27526baaf 100644 --- a/kernel/printk/index.c +++ b/kernel/printk/index.c @@ -145,7 +145,7 @@ static void pi_create_file(struct module *mod)  #ifdef CONFIG_MODULES  static void pi_remove_file(struct module *mod)  { -	debugfs_remove(debugfs_lookup(pi_get_module_name(mod), dfs_index)); +	debugfs_lookup_and_remove(pi_get_module_name(mod), dfs_index);  }  static int pi_module_notify(struct notifier_block *nb, unsigned long op, diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index d947ca6c84f9..2a17704136f1 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -14,6 +14,21 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,  #ifdef CONFIG_PRINTK +#ifdef CONFIG_PRINTK_CALLER +#define PRINTK_PREFIX_MAX	48 +#else +#define PRINTK_PREFIX_MAX	32 +#endif + +/* + * the maximum size of a formatted record (i.e. with prefix added + * per line and dropped messages or in extended message format) + */ +#define PRINTK_MESSAGE_MAX	2048 + +/* the maximum size allowed to be reserved for a record */ +#define PRINTKRB_RECORD_MAX	1024 +  /* Flags for a single printk record. */  enum printk_info_flags {  	LOG_NEWLINE	= 2,	/* text ended with a newline */ @@ -48,6 +63,10 @@ u16 printk_parse_prefix(const char *text, int *level,  			enum printk_info_flags *flags);  #else +#define PRINTK_PREFIX_MAX	0 +#define PRINTK_MESSAGE_MAX	0 +#define PRINTKRB_RECORD_MAX	0 +  /*   * In !PRINTK builds we still export console_sem   * semaphore and some of console functions (console_unlock()/etc.), so @@ -58,3 +77,29 @@ u16 printk_parse_prefix(const char *text, int *level,  static inline bool printk_percpu_data_ready(void) { return false; }  #endif /* CONFIG_PRINTK */ + +/** + * struct printk_buffers - Buffers to read/format/output printk messages. + * @outbuf:	After formatting, contains text to output. + * @scratchbuf:	Used as temporary ringbuffer reading and string-print space. + */ +struct printk_buffers { +	char	outbuf[PRINTK_MESSAGE_MAX]; +	char	scratchbuf[PRINTKRB_RECORD_MAX]; +}; + +/** + * struct printk_message - Container for a prepared printk message. + * @pbufs:	printk buffers used to prepare the message. 
+ * @outbuf_len:	The length of prepared text in @pbufs->outbuf to output. This + *		does not count the terminator. A value of 0 means there is + *		nothing to output and this record should be skipped. + * @seq:	The sequence number of the record used for @pbufs->outbuf. + * @dropped:	The number of dropped records from reading @seq. + */ +struct printk_message { +	struct printk_buffers	*pbufs; +	unsigned int		outbuf_len; +	u64			seq; +	unsigned long		dropped; +}; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7decf1e9c486..fd0c9f913940 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -123,6 +123,7 @@ bool console_srcu_read_lock_is_held(void)  {  	return srcu_read_lock_held(&console_srcu);  } +EXPORT_SYMBOL(console_srcu_read_lock_is_held);  #endif  enum devkmsg_log_bits { @@ -465,21 +466,6 @@ static struct latched_seq clear_seq = {  	.val[1]		= 0,  }; -#ifdef CONFIG_PRINTK_CALLER -#define PREFIX_MAX		48 -#else -#define PREFIX_MAX		32 -#endif - -/* the maximum size of a formatted record (i.e. with prefix added per line) */ -#define CONSOLE_LOG_MAX		1024 - -/* the maximum size for a dropped text message */ -#define DROPPED_TEXT_MAX	64 - -/* the maximum size allowed to be reserved for a record */ -#define LOG_LINE_MAX		(CONSOLE_LOG_MAX - PREFIX_MAX) -  #define LOG_LEVEL(v)		((v) & 0x07)  #define LOG_FACILITY(v)		((v) >> 3 & 0xff) @@ -710,16 +696,15 @@ out:  	return len;  } +static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, +				    bool is_extended, bool may_supress); +  /* /dev/kmsg - userspace message inject/listen interface */  struct devkmsg_user {  	atomic64_t seq;  	struct ratelimit_state rs;  	struct mutex lock; -	char buf[CONSOLE_EXT_LOG_MAX]; - -	struct printk_info info; -	char text_buf[CONSOLE_EXT_LOG_MAX]; -	struct printk_record record; +	struct printk_buffers pbufs;  };  static __printf(3, 4) __cold @@ -745,7 +730,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)  	size_t len = iov_iter_count(from);  	ssize_t ret = len; -	if (!user || len > LOG_LINE_MAX) +	if (!user || len > PRINTKRB_RECORD_MAX)  		return -EINVAL;  	/* Ignore when user logging is disabled. */ @@ -801,8 +786,10 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  			    size_t count, loff_t *ppos)  {  	struct devkmsg_user *user = file->private_data; -	struct printk_record *r = &user->record; -	size_t len; +	char *outbuf = &user->pbufs.outbuf[0]; +	struct printk_message pmsg = { +		.pbufs = &user->pbufs, +	};  	ssize_t ret;  	if (!user) @@ -812,7 +799,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  	if (ret)  		return ret; -	if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { +	if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {  		if (file->f_flags & O_NONBLOCK) {  			ret = -EAGAIN;  			goto out; @@ -829,36 +816,31 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  		 * This pairs with __wake_up_klogd:A.  		 
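Taken together, struct printk_buffers, struct printk_message and printk_get_next_message() (defined further down in printk.c) give every reader the same calling convention. A simplified, purely illustrative consumer loop in the style of the devkmsg_read() rework below; example_drain() and example_consume() are invented names, and the locking and blocking details are omitted.

/* Hypothetical sink for formatted records. */
static void example_consume(const char *text, unsigned int len);

static struct printk_buffers example_pbufs;

static void example_drain(u64 *seq, bool extended)
{
	struct printk_message pmsg = {
		.pbufs = &example_pbufs,
	};

	/* may_suppress=false: report everything, as /dev/kmsg does. */
	while (printk_get_next_message(&pmsg, *seq, extended, false)) {
		if (pmsg.dropped)
			pr_info("%lu records were missed\n", pmsg.dropped);
		if (pmsg.outbuf_len)
			example_consume(example_pbufs.outbuf, pmsg.outbuf_len);
		*seq = pmsg.seq + 1;
	}
}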
*/  		ret = wait_event_interruptible(log_wait, -				prb_read_valid(prb, -					atomic64_read(&user->seq), r)); /* LMM(devkmsg_read:A) */ +				printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, +							false)); /* LMM(devkmsg_read:A) */  		if (ret)  			goto out;  	} -	if (r->info->seq != atomic64_read(&user->seq)) { +	if (pmsg.dropped) {  		/* our last seen message is gone, return error and reset */ -		atomic64_set(&user->seq, r->info->seq); +		atomic64_set(&user->seq, pmsg.seq);  		ret = -EPIPE;  		goto out;  	} -	len = info_print_ext_header(user->buf, sizeof(user->buf), r->info); -	len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, -				  &r->text_buf[0], r->info->text_len, -				  &r->info->dev_info); - -	atomic64_set(&user->seq, r->info->seq + 1); +	atomic64_set(&user->seq, pmsg.seq + 1); -	if (len > count) { +	if (pmsg.outbuf_len > count) {  		ret = -EINVAL;  		goto out;  	} -	if (copy_to_user(buf, user->buf, len)) { +	if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {  		ret = -EFAULT;  		goto out;  	} -	ret = len; +	ret = pmsg.outbuf_len;  out:  	mutex_unlock(&user->lock);  	return ret; @@ -952,9 +934,6 @@ static int devkmsg_open(struct inode *inode, struct file *file)  	mutex_init(&user->lock); -	prb_rec_init_rd(&user->record, &user->info, -			&user->text_buf[0], sizeof(user->text_buf)); -  	atomic64_set(&user->seq, prb_first_valid_seq(prb));  	file->private_data = user; @@ -1149,7 +1128,7 @@ static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,  	return prb_record_text_space(&e);  } -static char setup_text_buf[LOG_LINE_MAX] __initdata; +static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;  void __init setup_log_buf(int early)  { @@ -1415,7 +1394,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog,  	size_t text_len = r->info->text_len;  	size_t buf_size = r->text_buf_size;  	char *text = r->text_buf; -	char prefix[PREFIX_MAX]; +	char prefix[PRINTK_PREFIX_MAX];  	bool truncated = false;  	size_t prefix_len;  	size_t line_len; @@ -1514,7 +1493,7 @@ static size_t get_record_print_text_size(struct printk_info *info,  					 unsigned int line_count,  					 bool syslog, bool time)  { -	char prefix[PREFIX_MAX]; +	char prefix[PRINTK_PREFIX_MAX];  	size_t prefix_len;  	prefix_len = info_print_prefix(info, syslog, time, prefix); @@ -1580,11 +1559,11 @@ static int syslog_print(char __user *buf, int size)  	int len = 0;  	u64 seq; -	text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); +	text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);  	if (!text)  		return -ENOMEM; -	prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); +	prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);  	mutex_lock(&syslog_lock); @@ -1685,7 +1664,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  	u64 seq;  	bool time; -	text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); +	text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);  	if (!text)  		return -ENOMEM; @@ -1697,7 +1676,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  	seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,  				     size, true, time); -	prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); +	prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);  	len = 0;  	prb_for_each_record(seq, prb, seq, &r) { @@ -1891,6 +1870,7 @@ static void console_lock_spinning_enable(void)  /**   * console_lock_spinning_disable_and_check - mark end of code where another   *	thread was able to busy wait and check if there is a waiter + * @cookie: cookie returned from 
console_srcu_read_lock()   *   * This is called at the end of the section where spinning is allowed.   * It has two functions. First, it is a signal that it is no longer @@ -2011,27 +1991,6 @@ static int console_trylock_spinning(void)  }  /* - * Call the specified console driver, asking it to write out the specified - * text and length. If @dropped_text is non-NULL and any records have been - * dropped, a dropped message will be written out first. - */ -static void call_console_driver(struct console *con, const char *text, size_t len, -				char *dropped_text) -{ -	size_t dropped_len; - -	if (con->dropped && dropped_text) { -		dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX, -				       "** %lu printk messages dropped **\n", -				       con->dropped); -		con->dropped = 0; -		con->write(con, dropped_text, dropped_len); -	} - -	con->write(con, text, len); -} - -/*   * Recursion is tracked separately on each CPU. If NMIs are supported, an   * additional NMI context per CPU is also separately tracked. Until per-CPU   * is available, a separate "early tracking" is performed. @@ -2194,7 +2153,7 @@ static u16 printk_sprint(char *text, u16 size, int facility,  		}  	} -	trace_console_rcuidle(text, text_len); +	trace_console(text, text_len);  	return text_len;  } @@ -2241,8 +2200,8 @@ int vprintk_store(int facility, int level,  	reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;  	va_end(args2); -	if (reserve_size > LOG_LINE_MAX) -		reserve_size = LOG_LINE_MAX; +	if (reserve_size > PRINTKRB_RECORD_MAX) +		reserve_size = PRINTKRB_RECORD_MAX;  	/* Extract log level or control flags. */  	if (facility == 0) @@ -2256,7 +2215,7 @@ int vprintk_store(int facility, int level,  	if (flags & LOG_CONT) {  		prb_rec_init_wr(&r, reserve_size); -		if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { +		if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {  			text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,  						 facility, &flags, fmt, args);  			r.info->text_len += text_len; @@ -2387,8 +2346,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre  #else /* CONFIG_PRINTK */ -#define CONSOLE_LOG_MAX		0 -#define DROPPED_TEXT_MAX	0  #define printk_time		false  #define prb_read_valid(rb, seq, r)	false @@ -2412,10 +2369,6 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,  				  struct dev_printk_info *dev_info) { return 0; }  static void console_lock_spinning_enable(void) { }  static int console_lock_spinning_disable_and_check(int cookie) { return 0; } -static void call_console_driver(struct console *con, const char *text, size_t len, -				char *dropped_text) -{ -}  static bool suppress_message_printing(int level) { return false; }  static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }  static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } @@ -2742,16 +2695,136 @@ static void __console_unlock(void)  }  /* - * Print one record for the given console. The record printed is whatever - * record is the next available record for the given console. + * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". This + * is achieved by shifting the existing message over and inserting the dropped + * message.   * - * @text is a buffer of size CONSOLE_LOG_MAX. + * @pmsg is the printk message to prepend.   * - * If extended messages should be printed, @ext_text is a buffer of size - * CONSOLE_EXT_LOG_MAX. 
Otherwise @ext_text must be NULL. + * @dropped is the dropped count to report in the dropped message.   * - * If dropped messages should be printed, @dropped_text is a buffer of size - * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. + * If the message text in @pmsg->pbufs->outbuf does not have enough space for + * the dropped message, the message text will be sufficiently truncated. + * + * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. + */ +#ifdef CONFIG_PRINTK +static void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) +{ +	struct printk_buffers *pbufs = pmsg->pbufs; +	const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); +	const size_t outbuf_sz = sizeof(pbufs->outbuf); +	char *scratchbuf = &pbufs->scratchbuf[0]; +	char *outbuf = &pbufs->outbuf[0]; +	size_t len; + +	len = scnprintf(scratchbuf, scratchbuf_sz, +		       "** %lu printk messages dropped **\n", dropped); + +	/* +	 * Make sure outbuf is sufficiently large before prepending. +	 * Keep at least the prefix when the message must be truncated. +	 * It is a rather theoretical problem when someone tries to +	 * use a minimalist buffer. +	 */ +	if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) +		return; + +	if (pmsg->outbuf_len + len >= outbuf_sz) { +		/* Truncate the message, but keep it terminated. */ +		pmsg->outbuf_len = outbuf_sz - (len + 1); +		outbuf[pmsg->outbuf_len] = 0; +	} + +	memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); +	memcpy(outbuf, scratchbuf, len); +	pmsg->outbuf_len += len; +} +#else +#define console_prepend_dropped(pmsg, dropped) +#endif /* CONFIG_PRINTK */ + +/* + * Read and format the specified record (or a later record if the specified + * record is not available). + * + * @pmsg will contain the formatted result. @pmsg->pbufs must point to a + * struct printk_buffers. + * + * @seq is the record to read and format. If it is not available, the next + * valid record is read. + * + * @is_extended specifies if the message should be formatted for extended + * console output. + * + * @may_supress specifies if records may be skipped based on loglevel. + * + * Returns false if no record is available. Otherwise true and all fields + * of @pmsg are valid. (See the documentation of struct printk_message + * for information about the @pmsg fields.) + */ +static bool printk_get_next_message(struct printk_message *pmsg, u64 seq, +				    bool is_extended, bool may_suppress) +{ +	static int panic_console_dropped; + +	struct printk_buffers *pbufs = pmsg->pbufs; +	const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); +	const size_t outbuf_sz = sizeof(pbufs->outbuf); +	char *scratchbuf = &pbufs->scratchbuf[0]; +	char *outbuf = &pbufs->outbuf[0]; +	struct printk_info info; +	struct printk_record r; +	size_t len = 0; + +	/* +	 * Formatting extended messages requires a separate buffer, so use the +	 * scratch buffer to read in the ringbuffer text. +	 * +	 * Formatting normal messages is done in-place, so read the ringbuffer +	 * text directly into the output buffer. +	 */ +	if (is_extended) +		prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); +	else +		prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); + +	if (!prb_read_valid(prb, seq, &r)) +		return false; + +	pmsg->seq = r.info->seq; +	pmsg->dropped = r.info->seq - seq; + +	/* +	 * Check for dropped messages in panic here so that printk +	 * suppression can occur as early as possible if necessary. 
+	 */ +	if (pmsg->dropped && +	    panic_in_progress() && +	    panic_console_dropped++ > 10) { +		suppress_panic_printk = 1; +		pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); +	} + +	/* Skip record that has level above the console loglevel. */ +	if (may_suppress && suppress_message_printing(r.info->level)) +		goto out; + +	if (is_extended) { +		len = info_print_ext_header(outbuf, outbuf_sz, r.info); +		len += msg_print_ext_body(outbuf + len, outbuf_sz - len, +					  &r.text_buf[0], r.info->text_len, &r.info->dev_info); +	} else { +		len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); +	} +out: +	pmsg->outbuf_len = len; +	return true; +} + +/* + * Print one record for the given console. The record printed is whatever + * record is the next available record for the given console.   *   * @handover will be set to true if a printk waiter has taken over the   * console_lock, in which case the caller is no longer holding both the @@ -2764,46 +2837,33 @@ static void __console_unlock(void)   *   * Requires the console_lock and the SRCU read lock.   */ -static bool console_emit_next_record(struct console *con, char *text, char *ext_text, -				     char *dropped_text, bool *handover, int cookie) +static bool console_emit_next_record(struct console *con, bool *handover, int cookie)  { -	static int panic_console_dropped; -	struct printk_info info; -	struct printk_record r; -	unsigned long flags; -	char *write_text; -	size_t len; +	static struct printk_buffers pbufs; -	prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); +	bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; +	char *outbuf = &pbufs.outbuf[0]; +	struct printk_message pmsg = { +		.pbufs = &pbufs, +	}; +	unsigned long flags;  	*handover = false; -	if (!prb_read_valid(prb, con->seq, &r)) +	if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))  		return false; -	if (con->seq != r.info->seq) { -		con->dropped += r.info->seq - con->seq; -		con->seq = r.info->seq; -		if (panic_in_progress() && panic_console_dropped++ > 10) { -			suppress_panic_printk = 1; -			pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n"); -		} -	} +	con->dropped += pmsg.dropped; -	/* Skip record that has level above the console loglevel. */ -	if (suppress_message_printing(r.info->level)) { -		con->seq++; +	/* Skip messages of formatted length 0. */ +	if (pmsg.outbuf_len == 0) { +		con->seq = pmsg.seq + 1;  		goto skip;  	} -	if (ext_text) { -		write_text = ext_text; -		len = info_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, r.info); -		len += msg_print_ext_body(ext_text + len, CONSOLE_EXT_LOG_MAX - len, -					  &r.text_buf[0], r.info->text_len, &r.info->dev_info); -	} else { -		write_text = text; -		len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); +	if (con->dropped && !is_extended) { +		console_prepend_dropped(&pmsg, con->dropped); +		con->dropped = 0;  	}  	/* @@ -2819,11 +2879,15 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_  	printk_safe_enter_irqsave(flags);  	console_lock_spinning_enable(); -	stop_critical_timings();	/* don't trace print latency */ -	call_console_driver(con, write_text, len, dropped_text); +	/* Do not trace print latency. */ +	stop_critical_timings(); + +	/* Write everything out to the hardware. 
*/ +	con->write(con, outbuf, pmsg.outbuf_len); +  	start_critical_timings(); -	con->seq++; +	con->seq = pmsg.seq + 1;  	*handover = console_lock_spinning_disable_and_check(cookie);  	printk_safe_exit_irqrestore(flags); @@ -2856,9 +2920,6 @@ skip:   */  static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)  { -	static char dropped_text[DROPPED_TEXT_MAX]; -	static char ext_text[CONSOLE_EXT_LOG_MAX]; -	static char text[CONSOLE_LOG_MAX];  	bool any_usable = false;  	struct console *con;  	bool any_progress; @@ -2878,16 +2939,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove  				continue;  			any_usable = true; -			if (console_srcu_read_flags(con) & CON_EXTENDED) { -				/* Extended consoles do not print "dropped messages". */ -				progress = console_emit_next_record(con, &text[0], -								    &ext_text[0], NULL, -								    handover, cookie); -			} else { -				progress = console_emit_next_record(con, &text[0], -								    NULL, &dropped_text[0], -								    handover, cookie); -			} +			progress = console_emit_next_record(con, handover, cookie);  			/*  			 * If a handover has occurred, the SRCU read lock diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54482193e1ed..0786450074c1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -813,7 +813,7 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,  {  	struct ptrace_rseq_configuration conf = {  		.rseq_abi_pointer = (u64)(uintptr_t)task->rseq, -		.rseq_abi_size = sizeof(*task->rseq), +		.rseq_abi_size = task->rseq_len,  		.signature = task->rseq_sig,  		.flags = 0,  	}; diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 232e29fe3e5e..2984de629f74 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -82,7 +82,7 @@ config RCU_CPU_STALL_TIMEOUT  config RCU_EXP_CPU_STALL_TIMEOUT  	int "Expedited RCU CPU stall timeout in milliseconds"  	depends on RCU_STALL_COMMON -	range 0 21000 +	range 0 300000  	default 0  	help  	  If a given expedited RCU grace period extends more than the @@ -92,6 +92,19 @@ config RCU_EXP_CPU_STALL_TIMEOUT  	  says to use the RCU_CPU_STALL_TIMEOUT value converted from  	  seconds to milliseconds. +config RCU_CPU_STALL_CPUTIME +	bool "Provide additional RCU stall debug information" +	depends on RCU_STALL_COMMON +	default n +	help +	  Collect statistics during the sampling period, such as the number of +	  (hard interrupts, soft interrupts, task switches) and the cputime of +	  (hard interrupts, soft interrupts, kernel tasks) are added to the +	  RCU stall report. For multiple continuous RCU stalls, all sampling +	  periods begin at half of the first RCU stall timeout. +	  The boot option rcupdate.rcu_cpu_stall_cputime has the same function +	  as this one, but will override this if it exists. +  config RCU_TRACE  	bool "Enable tracing for RCU"  	depends on DEBUG_KERNEL diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index c5aa934de59b..115616ac3bfa 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -224,6 +224,8 @@ extern int rcu_cpu_stall_ftrace_dump;  extern int rcu_cpu_stall_suppress;  extern int rcu_cpu_stall_timeout;  extern int rcu_exp_cpu_stall_timeout; +extern int rcu_cpu_stall_cputime; +extern bool rcu_exp_stall_task_details __read_mostly;  int rcu_jiffies_till_stall_check(void);  int rcu_exp_jiffies_till_stall_check(void); @@ -447,14 +449,20 @@ do {									\  /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. 
*/  static inline bool rcu_gp_is_normal(void) { return true; }  static inline bool rcu_gp_is_expedited(void) { return false; } +static inline bool rcu_async_should_hurry(void) { return false; }  static inline void rcu_expedite_gp(void) { }  static inline void rcu_unexpedite_gp(void) { } +static inline void rcu_async_hurry(void) { } +static inline void rcu_async_relax(void) { }  static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }  #else /* #ifdef CONFIG_TINY_RCU */  bool rcu_gp_is_normal(void);     /* Internal RCU use. */  bool rcu_gp_is_expedited(void);  /* Internal RCU use. */ +bool rcu_async_should_hurry(void);  /* Internal RCU use. */  void rcu_expedite_gp(void);  void rcu_unexpedite_gp(void); +void rcu_async_hurry(void); +void rcu_async_relax(void);  void rcupdate_announce_bootup_oddness(void);  #ifdef CONFIG_TASKS_RCU_GENERIC  void show_rcu_tasks_gp_kthreads(void); diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index c54ea2b6a36b..f71fac422c8f 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -89,7 +89,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)  }  /* Get the length of a segment of the rcu_segcblist structure. */ -static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg)  {  	return READ_ONCE(rsclp->seglen[seg]);  } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 431cee212467..4fe877f5f654 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,6 +15,8 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)  	return READ_ONCE(rclp->len);  } +long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg); +  /* Return number of callbacks in segmented callback list by summing seglen. 
*/  long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 634df26a2c27..8e6c023212cb 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -399,7 +399,7 @@ static int torture_readlock_not_held(void)  	return rcu_read_lock_bh_held() || rcu_read_lock_sched_held();  } -static int rcu_torture_read_lock(void) __acquires(RCU) +static int rcu_torture_read_lock(void)  {  	rcu_read_lock();  	return 0; @@ -441,7 +441,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)  	}  } -static void rcu_torture_read_unlock(int idx) __releases(RCU) +static void rcu_torture_read_unlock(int idx)  {  	rcu_read_unlock();  } @@ -625,7 +625,7 @@ static struct srcu_struct srcu_ctld;  static struct srcu_struct *srcu_ctlp = &srcu_ctl;  static struct rcu_torture_ops srcud_ops; -static int srcu_torture_read_lock(void) __acquires(srcu_ctlp) +static int srcu_torture_read_lock(void)  {  	if (cur_ops == &srcud_ops)  		return srcu_read_lock_nmisafe(srcu_ctlp); @@ -652,7 +652,7 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)  	}  } -static void srcu_torture_read_unlock(int idx) __releases(srcu_ctlp) +static void srcu_torture_read_unlock(int idx)  {  	if (cur_ops == &srcud_ops)  		srcu_read_unlock_nmisafe(srcu_ctlp, idx); @@ -814,13 +814,13 @@ static void synchronize_rcu_trivial(void)  	}  } -static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +static int rcu_torture_read_lock_trivial(void)  {  	preempt_disable();  	return 0;  } -static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +static void rcu_torture_read_unlock_trivial(int idx)  {  	preempt_enable();  } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 435c884c02b5..afa3e1a2f690 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -76,6 +76,8 @@ torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s");  // Wait until there are multiple CPUs before starting test.  torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,  	      "Holdoff time before test start (s)"); +// Number of typesafe_lookup structures, that is, the degree of concurrency. +torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures.");  // Number of loops per experiment, all readers execute operations concurrently.  torture_param(long, loops, 10000, "Number of loops per experiment.");  // Number of readers, with -1 defaulting to about 75% of the CPUs. @@ -124,7 +126,7 @@ static int exp_idx;  // Operations vector for selecting different types of tests.  
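The ->init() member of the ref_scale_ops vector defined next changes from returning void to returning bool, so that an init-time failure (for example a failed allocation in typesafe_init() further below) can be reported; ref_scale_init() now checks the return value and unwinds with -EUCLEAN rather than running a half-initialized test. A minimal sketch of a failure-capable init hook, using hypothetical my_array/my_init names that are not part of the patch:

	static long *my_array;

	/* Hypothetical ->init() implementation: report allocation failure. */
	static bool my_init(void)
	{
		my_array = kcalloc(nr_cpu_ids, sizeof(*my_array), GFP_KERNEL);
		return my_array != NULL;	/* false => ref_scale_init() unwinds */
	}
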
struct ref_scale_ops { -	void (*init)(void); +	bool (*init)(void);  	void (*cleanup)(void);  	void (*readsection)(const int nloops);  	void (*delaysection)(const int nloops, const int udl, const int ndl); @@ -162,8 +164,9 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl  	}  } -static void rcu_sync_scale_init(void) +static bool rcu_sync_scale_init(void)  { +	return true;  }  static struct ref_scale_ops rcu_ops = { @@ -315,9 +318,10 @@ static struct ref_scale_ops refcnt_ops = {  // Definitions for rwlock  static rwlock_t test_rwlock; -static void ref_rwlock_init(void) +static bool ref_rwlock_init(void)  {  	rwlock_init(&test_rwlock); +	return true;  }  static void ref_rwlock_section(const int nloops) @@ -351,9 +355,10 @@ static struct ref_scale_ops rwlock_ops = {  // Definitions for rwsem  static struct rw_semaphore test_rwsem; -static void ref_rwsem_init(void) +static bool ref_rwsem_init(void)  {  	init_rwsem(&test_rwsem); +	return true;  }  static void ref_rwsem_section(const int nloops) @@ -523,6 +528,237 @@ static struct ref_scale_ops clock_ops = {  	.name		= "clock"  }; +//////////////////////////////////////////////////////////////////////// +// +// Methods leveraging SLAB_TYPESAFE_BY_RCU. +// + +// Item to look up in a typesafe manner.  Array of pointers to these. +struct refscale_typesafe { +	atomic_t rts_refctr;  // Used by all flavors +	spinlock_t rts_lock; +	seqlock_t rts_seqlock; +	unsigned int a; +	unsigned int b; +}; + +static struct kmem_cache *typesafe_kmem_cachep; +static struct refscale_typesafe **rtsarray; +static long rtsarray_size; +static DEFINE_TORTURE_RANDOM_PERCPU(refscale_rand); +static bool (*rts_acquire)(struct refscale_typesafe *rtsp, unsigned int *start); +static bool (*rts_release)(struct refscale_typesafe *rtsp, unsigned int start); + +// Conditionally acquire an explicit in-structure reference count. +static bool typesafe_ref_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ +	return atomic_inc_not_zero(&rtsp->rts_refctr); +} + +// Unconditionally release an explicit in-structure reference count. +static bool typesafe_ref_release(struct refscale_typesafe *rtsp, unsigned int start) +{ +	if (!atomic_dec_return(&rtsp->rts_refctr)) { +		WRITE_ONCE(rtsp->a, rtsp->a + 1); +		kmem_cache_free(typesafe_kmem_cachep, rtsp); +	} +	return true; +} + +// Unconditionally acquire an explicit in-structure spinlock. +static bool typesafe_lock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ +	spin_lock(&rtsp->rts_lock); +	return true; +} + +// Unconditionally release an explicit in-structure spinlock. +static bool typesafe_lock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ +	spin_unlock(&rtsp->rts_lock); +	return true; +} + +// Unconditionally acquire an explicit in-structure sequence lock. +static bool typesafe_seqlock_acquire(struct refscale_typesafe *rtsp, unsigned int *start) +{ +	*start = read_seqbegin(&rtsp->rts_seqlock); +	return true; +} + +// Conditionally release an explicit in-structure sequence lock.  Return +// true if this release was successful, that is, if no retry is required. +static bool typesafe_seqlock_release(struct refscale_typesafe *rtsp, unsigned int start) +{ +	return !read_seqretry(&rtsp->rts_seqlock, start); +} + +// Do a read-side critical section with the specified delay in +// microseconds and nanoseconds inserted so as to increase probability +// of failure. 
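typesafe_delay_section() below follows the usual SLAB_TYPESAFE_BY_RCU lookup discipline: rcu_read_lock() only guarantees that the memory remains a valid object of this slab type, not that it is still the same logical object, so the reader must acquire a per-object reference, lock, or seqlock sample and then re-check an identity field (->a in the patch) before trusting what it read. A stripped-down sketch of the reference-count flavor of that pattern, with hypothetical my_obj/my_cache/my_slot/my_lookup names (kernel context assumed; an illustration, not part of the patch):

	struct my_obj {
		atomic_t refcnt;
		unsigned int key;		/* identity, re-checked after acquire */
	};

	static struct kmem_cache *my_cache;	/* created with SLAB_TYPESAFE_BY_RCU */
	static struct my_obj __rcu *my_slot;

	static struct my_obj *my_lookup(void)
	{
		struct my_obj *p;
		unsigned int key;

	retry:
		rcu_read_lock();
		p = rcu_dereference(my_slot);
		if (!p) {
			rcu_read_unlock();
			return NULL;
		}
		key = READ_ONCE(p->key);
		if (!atomic_inc_not_zero(&p->refcnt)) {
			/* Refcount already zero: object is being freed, retry. */
			rcu_read_unlock();
			goto retry;
		}
		if (READ_ONCE(p->key) != key) {
			/* Freed and reused while we looked: drop it and retry. */
			if (atomic_dec_and_test(&p->refcnt))
				kmem_cache_free(my_cache, p);
			rcu_read_unlock();
			goto retry;
		}
		rcu_read_unlock();
		return p;	/* caller now holds a counted reference */
	}
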
+static void typesafe_delay_section(const int nloops, const int udl, const int ndl) +{ +	unsigned int a; +	unsigned int b; +	int i; +	long idx; +	struct refscale_typesafe *rtsp; +	unsigned int start; + +	for (i = nloops; i >= 0; i--) { +		preempt_disable(); +		idx = torture_random(this_cpu_ptr(&refscale_rand)) % rtsarray_size; +		preempt_enable(); +retry: +		rcu_read_lock(); +		rtsp = rcu_dereference(rtsarray[idx]); +		a = READ_ONCE(rtsp->a); +		if (!rts_acquire(rtsp, &start)) { +			rcu_read_unlock(); +			goto retry; +		} +		if (a != READ_ONCE(rtsp->a)) { +			(void)rts_release(rtsp, start); +			rcu_read_unlock(); +			goto retry; +		} +		un_delay(udl, ndl); +		// Remember, seqlock read-side release can fail. +		if (!rts_release(rtsp, start)) { +			rcu_read_unlock(); +			goto retry; +		} +		b = READ_ONCE(rtsp->a); +		WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); +		b = rtsp->b; +		rcu_read_unlock(); +		WARN_ON_ONCE(a * a != b); +	} +} + +// Because the acquisition and release methods are expensive, there +// is no point in optimizing away the un_delay() function's two checks. +// Thus simply define typesafe_read_section() as a simple wrapper around +// typesafe_delay_section(). +static void typesafe_read_section(const int nloops) +{ +	typesafe_delay_section(nloops, 0, 0); +} + +// Allocate and initialize one refscale_typesafe structure. +static struct refscale_typesafe *typesafe_alloc_one(void) +{ +	struct refscale_typesafe *rtsp; + +	rtsp = kmem_cache_alloc(typesafe_kmem_cachep, GFP_KERNEL); +	if (!rtsp) +		return NULL; +	atomic_set(&rtsp->rts_refctr, 1); +	WRITE_ONCE(rtsp->a, rtsp->a + 1); +	WRITE_ONCE(rtsp->b, rtsp->a * rtsp->a); +	return rtsp; +} + +// Slab-allocator constructor for refscale_typesafe structures created +// out of a new slab of system memory. +static void refscale_typesafe_ctor(void *rtsp_in) +{ +	struct refscale_typesafe *rtsp = rtsp_in; + +	spin_lock_init(&rtsp->rts_lock); +	seqlock_init(&rtsp->rts_seqlock); +	preempt_disable(); +	rtsp->a = torture_random(this_cpu_ptr(&refscale_rand)); +	preempt_enable(); +} + +static struct ref_scale_ops typesafe_ref_ops; +static struct ref_scale_ops typesafe_lock_ops; +static struct ref_scale_ops typesafe_seqlock_ops; + +// Initialize for a typesafe test. +static bool typesafe_init(void) +{ +	long idx; +	long si = lookup_instances; + +	typesafe_kmem_cachep = kmem_cache_create("refscale_typesafe", +						 sizeof(struct refscale_typesafe), sizeof(void *), +						 SLAB_TYPESAFE_BY_RCU, refscale_typesafe_ctor); +	if (!typesafe_kmem_cachep) +		return false; +	if (si < 0) +		si = -si * nr_cpu_ids; +	else if (si == 0) +		si = nr_cpu_ids; +	rtsarray_size = si; +	rtsarray = kcalloc(si, sizeof(*rtsarray), GFP_KERNEL); +	if (!rtsarray) +		return false; +	for (idx = 0; idx < rtsarray_size; idx++) { +		rtsarray[idx] = typesafe_alloc_one(); +		if (!rtsarray[idx]) +			return false; +	} +	if (cur_ops == &typesafe_ref_ops) { +		rts_acquire = typesafe_ref_acquire; +		rts_release = typesafe_ref_release; +	} else if (cur_ops == &typesafe_lock_ops) { +		rts_acquire = typesafe_lock_acquire; +		rts_release = typesafe_lock_release; +	} else if (cur_ops == &typesafe_seqlock_ops) { +		rts_acquire = typesafe_seqlock_acquire; +		rts_release = typesafe_seqlock_release; +	} else { +		WARN_ON_ONCE(1); +		return false; +	} +	return true; +} + +// Clean up after a typesafe test. 
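As a usage note on typesafe_init() above: the new lookup_instances module parameter sets the number of objects available for concurrent lookup, where a positive value is used as-is, zero (the default) means one structure per CPU, and a negative value -N means N structures per CPU. Assuming refscale's usual scale_type selector, the new flavors would be exercised with module or boot parameters along the lines of

	refscale.scale_type=typesafe_ref refscale.lookup_instances=-2

with typesafe_lock and typesafe_seqlock selecting the spinlock and seqlock variants defined further below.
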
+static void typesafe_cleanup(void) +{ +	long idx; + +	if (rtsarray) { +		for (idx = 0; idx < rtsarray_size; idx++) +			kmem_cache_free(typesafe_kmem_cachep, rtsarray[idx]); +		kfree(rtsarray); +		rtsarray = NULL; +		rtsarray_size = 0; +	} +	kmem_cache_destroy(typesafe_kmem_cachep); +	typesafe_kmem_cachep = NULL; +	rts_acquire = NULL; +	rts_release = NULL; +} + +// The typesafe_init() function distinguishes these structures by address. +static struct ref_scale_ops typesafe_ref_ops = { +	.init		= typesafe_init, +	.cleanup	= typesafe_cleanup, +	.readsection	= typesafe_read_section, +	.delaysection	= typesafe_delay_section, +	.name		= "typesafe_ref" +}; + +static struct ref_scale_ops typesafe_lock_ops = { +	.init		= typesafe_init, +	.cleanup	= typesafe_cleanup, +	.readsection	= typesafe_read_section, +	.delaysection	= typesafe_delay_section, +	.name		= "typesafe_lock" +}; + +static struct ref_scale_ops typesafe_seqlock_ops = { +	.init		= typesafe_init, +	.cleanup	= typesafe_cleanup, +	.readsection	= typesafe_read_section, +	.delaysection	= typesafe_delay_section, +	.name		= "typesafe_seqlock" +}; +  static void rcu_scale_one_reader(void)  {  	if (readdelay <= 0) @@ -812,6 +1048,7 @@ ref_scale_init(void)  	static struct ref_scale_ops *scale_ops[] = {  		&rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops,  		&rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, +		&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,  	};  	if (!torture_init_begin(scale_type, verbose)) @@ -833,7 +1070,10 @@ ref_scale_init(void)  		goto unwind;  	}  	if (cur_ops->init) -		cur_ops->init(); +		if (!cur_ops->init()) { +			firsterr = -EUCLEAN; +			goto unwind; +		}  	ref_scale_print_module_parms(cur_ops, "Start of test"); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index ca4b5dcec675..ab4ee58af84b 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -154,7 +154,7 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)   */  static inline bool srcu_invl_snp_seq(unsigned long s)  { -	return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ; +	return s == SRCU_SNP_INIT_SEQ;  }  /* @@ -469,24 +469,59 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)  	/*  	 * If the locks are the same as the unlocks, then there must have -	 * been no readers on this index at some time in between. This does -	 * not mean that there are no more readers, as one could have read -	 * the current index but not have incremented the lock counter yet. +	 * been no readers on this index at some point in this function. +	 * But there might be more readers, as a task might have read +	 * the current ->srcu_idx but not yet have incremented its CPU's +	 * ->srcu_lock_count[idx] counter.  In fact, it is possible +	 * that most of the tasks have been preempted between fetching +	 * ->srcu_idx and incrementing ->srcu_lock_count[idx].  And there +	 * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks +	 * in a system whose address space was fully populated with memory. +	 * Call this quantity Nt.  	 * -	 * So suppose that the updater is preempted here for so long -	 * that more than ULONG_MAX non-nested readers come and go in -	 * the meantime.  
It turns out that this cannot result in overflow -	 * because if a reader modifies its unlock count after we read it -	 * above, then that reader's next load of ->srcu_idx is guaranteed -	 * to get the new value, which will cause it to operate on the -	 * other bank of counters, where it cannot contribute to the -	 * overflow of these counters.  This means that there is a maximum -	 * of 2*NR_CPUS increments, which cannot overflow given current -	 * systems, especially not on 64-bit systems. +	 * So suppose that the updater is preempted at this point in the +	 * code for a long time.  That now-preempted updater has already +	 * flipped ->srcu_idx (possibly during the preceding grace period), +	 * done an smp_mb() (again, possibly during the preceding grace +	 * period), and summed up the ->srcu_unlock_count[idx] counters. +	 * How many times can a given one of the aforementioned Nt tasks +	 * increment the old ->srcu_idx value's ->srcu_lock_count[idx] +	 * counter, in the absence of nesting?  	 * -	 * OK, how about nesting?  This does impose a limit on nesting -	 * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient, -	 * especially on 64-bit systems. +	 * It can clearly do so once, given that it has already fetched +	 * the old value of ->srcu_idx and is just about to use that value +	 * to index its increment of ->srcu_lock_count[idx].  But as soon as +	 * it leaves that SRCU read-side critical section, it will increment +	 * ->srcu_unlock_count[idx], which must follow the updater's above +	 * read from that same value.  Thus, as soon the reading task does +	 * an smp_mb() and a later fetch from ->srcu_idx, that task will be +	 * guaranteed to get the new index.  Except that the increment of +	 * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the +	 * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() +	 * is before the smp_mb().  Thus, that task might not see the new +	 * value of ->srcu_idx until the -second- __srcu_read_lock(), +	 * which in turn means that this task might well increment +	 * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, +	 * not just once. +	 * +	 * However, it is important to note that a given smp_mb() takes +	 * effect not just for the task executing it, but also for any +	 * later task running on that same CPU. +	 * +	 * That is, there can be almost Nt + Nc further increments of +	 * ->srcu_lock_count[idx] for the old index, where Nc is the number +	 * of CPUs.  But this is OK because the size of the task_struct +	 * structure limits the value of Nt and current systems limit Nc +	 * to a few thousand. +	 * +	 * OK, but what about nesting?  This does impose a limit on +	 * nesting of half of the size of the task_struct structure +	 * (measured in bytes), which should be sufficient.  A late 2022 +	 * TREE01 rcutorture run reported this size to be no less than +	 * 9408 bytes, allowing up to 4704 levels of nesting, which is +	 * comfortably beyond excessive.  Especially on 64-bit systems, +	 * which are unlikely to be configured with an address space fully +	 * populated with memory, at least not anytime soon.  	 
*/  	return srcu_readers_lock_idx(ssp, idx) == unlocks;  } @@ -726,7 +761,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)  	int state;  	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) -		sdp = per_cpu_ptr(ssp->sda, 0); +		sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());  	else  		sdp = this_cpu_ptr(ssp->sda);  	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock)); @@ -837,7 +872,8 @@ static void srcu_gp_end(struct srcu_struct *ssp)  	/* Initiate callback invocation as needed. */  	ss_state = smp_load_acquire(&ssp->srcu_size_state);  	if (ss_state < SRCU_SIZE_WAIT_BARRIER) { -		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay); +		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), +					cbdelay);  	} else {  		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);  		srcu_for_each_node_breadth_first(ssp, snp) { @@ -914,7 +950,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp  	if (snp)  		for (; snp != NULL; snp = snp->srcu_parent) {  			sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); -			if (rcu_seq_done(&ssp->srcu_gp_seq, s) || +			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) ||  			    (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))  				return;  			spin_lock_irqsave_rcu_node(snp, flags); @@ -941,6 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp   *   * Note that this function also does the work of srcu_funnel_exp_start(),   * in some cases by directly invoking it. + * + * The srcu read lock should be hold around this function. And s is a seq snap + * after holding that lock.   */  static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,  				 unsigned long s, bool do_norm) @@ -961,7 +1000,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,  	if (snp_leaf)  		/* Each pass through the loop does one level of the srcu_node tree. */  		for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { -			if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf) +			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf)  				return; /* GP already done and CBs recorded. */  			spin_lock_irqsave_rcu_node(snp, flags);  			snp_seq = snp->srcu_have_cbs[idx]; @@ -998,8 +1037,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,  	if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))  		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); -	/* If grace period not already done and none in progress, start it. */ -	if (!rcu_seq_done(&ssp->srcu_gp_seq, s) && +	/* If grace period not already in progress, start it. */ +	if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) &&  	    rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {  		WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));  		srcu_gp_start(ssp); @@ -1059,10 +1098,11 @@ static void srcu_flip(struct srcu_struct *ssp)  	/*  	 * Ensure that if the updater misses an __srcu_read_unlock() -	 * increment, that task's next __srcu_read_lock() will see the -	 * above counter update.  Note that both this memory barrier -	 * and the one in srcu_readers_active_idx_check() provide the -	 * guarantee for __srcu_read_lock(). +	 * increment, that task's __srcu_read_lock() following its next +	 * __srcu_read_lock() or __srcu_read_unlock() will see the above +	 * counter update.  
Note that both this memory barrier and the +	 * one in srcu_readers_active_idx_check() provide the guarantee +	 * for __srcu_read_lock().  	 */  	smp_mb(); /* D */  /* Pairs with C. */  } @@ -1161,7 +1201,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,  	idx = __srcu_read_lock_nmisafe(ssp);  	ss_state = smp_load_acquire(&ssp->srcu_size_state);  	if (ss_state < SRCU_SIZE_WAIT_CALL) -		sdp = per_cpu_ptr(ssp->sda, 0); +		sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());  	else  		sdp = raw_cpu_ptr(ssp->sda);  	spin_lock_irqsave_sdp_contention(sdp, &flags); @@ -1497,7 +1537,7 @@ void srcu_barrier(struct srcu_struct *ssp)  	idx = __srcu_read_lock_nmisafe(ssp);  	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) -		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0)); +		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda,	get_boot_cpu_id()));  	else  		for_each_possible_cpu(cpu)  			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index fe9840d90e96..bfb5e1549f2b 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -384,6 +384,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)  {  	int cpu;  	unsigned long flags; +	bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);  	long n;  	long ncbs = 0;  	long ncbsnz = 0; @@ -425,21 +426,23 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)  			WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));  			smp_store_release(&rtp->percpu_enqueue_lim, 1);  			rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); +			gpdone = false;  			pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);  		}  		raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);  	} -	if (rcu_task_cb_adjust && !ncbsnz && -	    poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) { +	if (rcu_task_cb_adjust && !ncbsnz && gpdone) {  		raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);  		if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) {  			WRITE_ONCE(rtp->percpu_dequeue_lim, 1);  			pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name);  		} -		for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { -			struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); +		if (rtp->percpu_dequeue_lim == 1) { +			for (cpu = rtp->percpu_dequeue_lim; cpu < nr_cpu_ids; cpu++) { +				struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); -			WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); +				WARN_ON_ONCE(rcu_segcblist_n_cbs(&rtpcp->cblist)); +			}  		}  		raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags);  	} @@ -560,8 +563,9 @@ static int __noreturn rcu_tasks_kthread(void *arg)  static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)  {  	/* Complain if the scheduler has not started.  */ -	WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, -			 "synchronize_rcu_tasks called too soon"); +	if (WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, +			 "synchronize_%s() called too soon", rtp->name)) +		return;  	// If the grace-period kthread is running, use it.  	if (READ_ONCE(rtp->kthread_ptr)) { @@ -827,11 +831,21 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)  static void rcu_tasks_postscan(struct list_head *hop)  {  	/* -	 * Wait for tasks that are in the process of exiting.  
This -	 * does only part of the job, ensuring that all tasks that were -	 * previously exiting reach the point where they have disabled -	 * preemption, allowing the later synchronize_rcu() to finish -	 * the job. +	 * Exiting tasks may escape the tasklist scan. Those are vulnerable +	 * until their final schedule() with TASK_DEAD state. To cope with +	 * this, divide the fragile exit path part in two intersecting +	 * read side critical sections: +	 * +	 * 1) An _SRCU_ read side starting before calling exit_notify(), +	 *    which may remove the task from the tasklist, and ending after +	 *    the final preempt_disable() call in do_exit(). +	 * +	 * 2) An _RCU_ read side starting with the final preempt_disable() +	 *    call in do_exit() and ending with the final call to schedule() +	 *    with TASK_DEAD state. +	 * +	 * This handles the part 1). And postgp will handle part 2) with a +	 * call to synchronize_rcu().  	 */  	synchronize_srcu(&tasks_rcu_exit_srcu);  } @@ -898,7 +912,10 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)  	 *  	 * In addition, this synchronize_rcu() waits for exiting tasks  	 * to complete their final preempt_disable() region of execution, -	 * cleaning up after the synchronize_srcu() above. +	 * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu), +	 * enforcing the whole region before tasklist removal until +	 * the final schedule() with TASK_DEAD state to be an RCU TASKS +	 * read side critical section.  	 */  	synchronize_rcu();  } @@ -988,27 +1005,42 @@ void show_rcu_tasks_classic_gp_kthread(void)  EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);  #endif // !defined(CONFIG_TINY_RCU) -/* Do the srcu_read_lock() for the above synchronize_srcu().  */ +/* + * Contribute to protect against tasklist scan blind spot while the + * task is exiting and may be removed from the tasklist. See + * corresponding synchronize_srcu() for further details. + */  void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)  { -	preempt_disable();  	current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); -	preempt_enable();  } -/* Do the srcu_read_unlock() for the above synchronize_srcu().  */ -void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) +/* + * Contribute to protect against tasklist scan blind spot while the + * task is exiting and may be removed from the tasklist. See + * corresponding synchronize_srcu() for further details. + */ +void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu)  {  	struct task_struct *t = current; -	preempt_disable();  	__srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); -	preempt_enable(); -	exit_tasks_rcu_finish_trace(t); +} + +/* + * Contribute to protect against tasklist scan blind spot while the + * task is exiting and may be removed from the tasklist. See + * corresponding synchronize_srcu() for further details. + */ +void exit_tasks_rcu_finish(void) +{ +	exit_tasks_rcu_stop(); +	exit_tasks_rcu_finish_trace(current);  }  #else /* #ifdef CONFIG_TASKS_RCU */  void exit_tasks_rcu_start(void) { } +void exit_tasks_rcu_stop(void) { }  void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }  #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -1036,9 +1068,6 @@ static void rcu_tasks_be_rude(struct work_struct *work)  // Wait for one rude RCU-tasks grace period.  static void rcu_tasks_rude_wait_gp(struct rcu_tasks *rtp)  { -	if (num_online_cpus() <= 1) -		return;	// Fastpath for only one CPU. 
-  	rtp->n_ipis += cpumask_weight(cpu_online_mask);  	schedule_on_each_cpu(rcu_tasks_be_rude);  } @@ -1815,23 +1844,21 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)  static void rcu_tasks_initiate_self_tests(void)  { -	unsigned long j = jiffies; -  	pr_info("Running RCU-tasks wait API self tests\n");  #ifdef CONFIG_TASKS_RCU -	tests[0].runstart = j; +	tests[0].runstart = jiffies;  	synchronize_rcu_tasks();  	call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);  #endif  #ifdef CONFIG_TASKS_RUDE_RCU -	tests[1].runstart = j; +	tests[1].runstart = jiffies;  	synchronize_rcu_tasks_rude();  	call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);  #endif  #ifdef CONFIG_TASKS_TRACE_RCU -	tests[2].runstart = j; +	tests[2].runstart = jiffies;  	synchronize_rcu_tasks_trace();  	call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);  #endif diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 72913ce21258..42f7589e51e0 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -246,15 +246,12 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)  EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);  #ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, void *ptr)  { -	if (head) { -		void *ptr = (void *) head - (unsigned long) func; - +	if (head)  		kasan_record_aux_stack_noalloc(ptr); -	} -	__kvfree_call_rcu(head, func); +	__kvfree_call_rcu(head, ptr);  }  EXPORT_SYMBOL_GPL(kvfree_call_rcu);  #endif diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cf34a961821a..8e880c09ab59 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -144,14 +144,16 @@ static int rcu_scheduler_fully_active __read_mostly;  static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,  			      unsigned long gps, unsigned long flags); -static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);  static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);  static void invoke_rcu_core(void);  static void rcu_report_exp_rdp(struct rcu_data *rdp);  static void sync_sched_exp_online_cleanup(int cpu);  static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);  static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); +static bool rcu_rdp_cpu_online(struct rcu_data *rdp); +static bool rcu_init_invoked(void); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);  /*   * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" @@ -215,27 +217,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);  #define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays for debugging. */  /* - * Compute the mask of online CPUs for the specified rcu_node structure. - * This will not be stable unless the rcu_node structure's ->lock is - * held, but the bit corresponding to the current CPU will be stable - * in most contexts. - */ -static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) -{ -	return READ_ONCE(rnp->qsmaskinitnext); -} - -/* - * Is the CPU corresponding to the specified rcu_data structure online - * from RCU's perspective?  This perspective is given by that structure's - * ->qsmaskinitnext field rather than by the global cpu_online_mask. - */ -static bool rcu_rdp_cpu_online(struct rcu_data *rdp) -{ -	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); -} - -/*   * Return true if an RCU grace period is in progress.  
The READ_ONCE()s   * permit this function to be invoked without holding the root rcu_node   * structure's ->lock, but of course results can be subject to change. @@ -734,46 +715,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t)  	smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);  } -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) - -/* - * Is the current CPU online as far as RCU is concerned? - * - * Disable preemption to avoid false positives that could otherwise - * happen due to the current CPU number being sampled, this task being - * preempted, its old CPU being taken offline, resuming on some other CPU, - * then determining that its old CPU is now offline. - * - * Disable checking if in an NMI handler because we cannot safely - * report errors from NMI handlers anyway.  In addition, it is OK to use - * RCU on an offline processor during initial boot, hence the check for - * rcu_scheduler_fully_active. - */ -bool rcu_lockdep_current_cpu_online(void) -{ -	struct rcu_data *rdp; -	bool ret = false; - -	if (in_nmi() || !rcu_scheduler_fully_active) -		return true; -	preempt_disable_notrace(); -	rdp = this_cpu_ptr(&rcu_data); -	/* -	 * Strictly, we care here about the case where the current CPU is -	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask -	 * not being up to date. So arch_spin_is_locked() might have a -	 * false positive if it's held by some *other* CPU, but that's -	 * OK because that just means a false *negative* on the warning. -	 */ -	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) -		ret = true; -	preempt_enable_notrace(); -	return ret; -} -EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); - -#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ -  /*   * When trying to report a quiescent state on behalf of some other CPU,   * it is our responsibility to check for and handle potential overflow @@ -925,6 +866,24 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  			rdp->rcu_iw_gp_seq = rnp->gp_seq;  			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);  		} + +		if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) { +			int cpu = rdp->cpu; +			struct rcu_snap_record *rsrp; +			struct kernel_cpustat *kcsp; + +			kcsp = &kcpustat_cpu(cpu); + +			rsrp = &rdp->snap_record; +			rsrp->cputime_irq     = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); +			rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); +			rsrp->cputime_system  = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); +			rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu); +			rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu); +			rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu); +			rsrp->jiffies = jiffies; +			rsrp->gp_seq = rdp->gp_seq; +		}  	}  	return 0; @@ -1350,13 +1309,6 @@ static void rcu_strict_gp_boundary(void *unused)  	invoke_rcu_core();  } -// Has rcu_init() been invoked?  This is used (for example) to determine -// whether spinlocks may be acquired safely. -static bool rcu_init_invoked(void) -{ -	return !!rcu_state.n_online_cpus; -} -  // Make the polled API aware of the beginning of a grace period.  static void rcu_poll_gp_seq_start(unsigned long *snap)  { @@ -2092,92 +2044,6 @@ rcu_check_quiescent_state(struct rcu_data *rdp)  }  /* - * Near the end of the offline process.  Trace the fact that this CPU - * is going offline. 
- */ -int rcutree_dying_cpu(unsigned int cpu) -{ -	bool blkd; -	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	struct rcu_node *rnp = rdp->mynode; - -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) -		return 0; - -	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); -	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), -			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); -	return 0; -} - -/* - * All CPUs for the specified rcu_node structure have gone offline, - * and all tasks that were preempted within an RCU read-side critical - * section while running on one of those CPUs have since exited their RCU - * read-side critical section.  Some other CPU is reporting this fact with - * the specified rcu_node structure's ->lock held and interrupts disabled. - * This function therefore goes up the tree of rcu_node structures, - * clearing the corresponding bits in the ->qsmaskinit fields.  Note that - * the leaf rcu_node structure's ->qsmaskinit field has already been - * updated. - * - * This function does check that the specified rcu_node structure has - * all CPUs offline and no blocked tasks, so it is OK to invoke it - * prematurely.  That said, invoking it after the fact will cost you - * a needless lock acquisition.  So once it has done its work, don't - * invoke it again. - */ -static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) -{ -	long mask; -	struct rcu_node *rnp = rnp_leaf; - -	raw_lockdep_assert_held_rcu_node(rnp_leaf); -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || -	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) || -	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) -		return; -	for (;;) { -		mask = rnp->grpmask; -		rnp = rnp->parent; -		if (!rnp) -			break; -		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ -		rnp->qsmaskinit &= ~mask; -		/* Between grace periods, so better already be zero! */ -		WARN_ON_ONCE(rnp->qsmask); -		if (rnp->qsmaskinit) { -			raw_spin_unlock_rcu_node(rnp); -			/* irqs remain disabled. */ -			return; -		} -		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ -	} -} - -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context.  Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ -	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */ - -	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) -		return 0; - -	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); -	/* Adjust any no-longer-needed kthreads. */ -	rcu_boost_kthread_setaffinity(rnp, -1); -	// Stop-machine done, so allow nohz_full to disable tick. -	tick_dep_clear(TICK_DEP_BIT_RCU); -	return 0; -} - -/*   * Invoke any RCU callbacks that have made it to the end of their grace   * period.  Throttle as specified by rdp->blimit.   */ @@ -2209,7 +2075,7 @@ static void rcu_do_batch(struct rcu_data *rdp)  	 */  	rcu_nocb_lock_irqsave(rdp, flags);  	WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); -	pending = rcu_segcblist_n_cbs(&rdp->cblist); +	pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);  	div = READ_ONCE(rcu_divisor);  	div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? 
sizeof(long) * 8 - 2 : div;  	bl = max(rdp->blimit, pending >> div); @@ -2727,10 +2593,11 @@ static void check_cb_ovld(struct rcu_data *rdp)  }  static void -__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)  {  	static atomic_t doublefrees;  	unsigned long flags; +	bool lazy;  	struct rcu_data *rdp;  	bool was_alldone; @@ -2755,6 +2622,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)  	kasan_record_aux_stack_noalloc(head);  	local_irq_save(flags);  	rdp = this_cpu_ptr(&rcu_data); +	lazy = lazy_in && !rcu_async_should_hurry();  	/* Add the callback to our list. */  	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { @@ -2876,13 +2744,15 @@ EXPORT_SYMBOL_GPL(call_rcu);  /**   * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers + * @list: List node. All blocks are linked between each other + * @gp_snap: Snapshot of RCU state for objects placed to this bulk   * @nr_records: Number of active pointers in the array - * @next: Next bulk object in the block chain   * @records: Array of the kvfree_rcu() pointers   */  struct kvfree_rcu_bulk_data { +	struct list_head list; +	unsigned long gp_snap;  	unsigned long nr_records; -	struct kvfree_rcu_bulk_data *next;  	void *records[];  }; @@ -2898,26 +2768,28 @@ struct kvfree_rcu_bulk_data {   * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests   * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period   * @head_free: List of kfree_rcu() objects waiting for a grace period - * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period + * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period   * @krcp: Pointer to @kfree_rcu_cpu structure   */  struct kfree_rcu_cpu_work {  	struct rcu_work rcu_work;  	struct rcu_head *head_free; -	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; +	struct list_head bulk_head_free[FREE_N_CHANNELS];  	struct kfree_rcu_cpu *krcp;  };  /**   * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period   * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period + * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" + * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period   * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period   * @lock: Synchronize access to this structure   * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES   * @initialized: The @rcu_work fields have been initialized - * @count: Number of objects for which GP not started + * @head_count: Number of objects in rcu_head singular list + * @bulk_count: Number of objects in bulk-list   * @bkvcache:   *	A simple cache list that contains objects for reuse purpose.   *	In order to save some per-cpu space the list is singular. @@ -2935,13 +2807,20 @@ struct kfree_rcu_cpu_work {   * the interactions with the slab allocators.   */  struct kfree_rcu_cpu { +	// Objects queued on a linked list +	// through their rcu_head structures.  	struct rcu_head *head; -	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; +	unsigned long head_gp_snap; +	atomic_t head_count; + +	// Objects queued on a bulk-list. 
+	struct list_head bulk_head[FREE_N_CHANNELS]; +	atomic_t bulk_count[FREE_N_CHANNELS]; +  	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];  	raw_spinlock_t lock;  	struct delayed_work monitor_work;  	bool initialized; -	int count;  	struct delayed_work page_cache_work;  	atomic_t backoff_page_cache_fill; @@ -3029,29 +2908,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)  	return freed;  } +static void +kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, +	struct kvfree_rcu_bulk_data *bnode, int idx) +{ +	unsigned long flags; +	int i; + +	debug_rcu_bhead_unqueue(bnode); + +	rcu_lock_acquire(&rcu_callback_map); +	if (idx == 0) { // kmalloc() / kfree(). +		trace_rcu_invoke_kfree_bulk_callback( +			rcu_state.name, bnode->nr_records, +			bnode->records); + +		kfree_bulk(bnode->nr_records, bnode->records); +	} else { // vmalloc() / vfree(). +		for (i = 0; i < bnode->nr_records; i++) { +			trace_rcu_invoke_kvfree_callback( +				rcu_state.name, bnode->records[i], 0); + +			vfree(bnode->records[i]); +		} +	} +	rcu_lock_release(&rcu_callback_map); + +	raw_spin_lock_irqsave(&krcp->lock, flags); +	if (put_cached_bnode(krcp, bnode)) +		bnode = NULL; +	raw_spin_unlock_irqrestore(&krcp->lock, flags); + +	if (bnode) +		free_page((unsigned long) bnode); + +	cond_resched_tasks_rcu_qs(); +} + +static void +kvfree_rcu_list(struct rcu_head *head) +{ +	struct rcu_head *next; + +	for (; head; head = next) { +		void *ptr = (void *) head->func; +		unsigned long offset = (void *) head - ptr; + +		next = head->next; +		debug_rcu_head_unqueue((struct rcu_head *)ptr); +		rcu_lock_acquire(&rcu_callback_map); +		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); + +		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) +			kvfree(ptr); + +		rcu_lock_release(&rcu_callback_map); +		cond_resched_tasks_rcu_qs(); +	} +} +  /*   * This function is invoked in workqueue context after a grace period. - * It frees all the objects queued on ->bkvhead_free or ->head_free. + * It frees all the objects queued on ->bulk_head_free or ->head_free.   */  static void kfree_rcu_work(struct work_struct *work)  {  	unsigned long flags; -	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; -	struct rcu_head *head, *next; +	struct kvfree_rcu_bulk_data *bnode, *n; +	struct list_head bulk_head[FREE_N_CHANNELS]; +	struct rcu_head *head;  	struct kfree_rcu_cpu *krcp;  	struct kfree_rcu_cpu_work *krwp; -	int i, j; +	int i;  	krwp = container_of(to_rcu_work(work), -			    struct kfree_rcu_cpu_work, rcu_work); +		struct kfree_rcu_cpu_work, rcu_work);  	krcp = krwp->krcp;  	raw_spin_lock_irqsave(&krcp->lock, flags);  	// Channels 1 and 2. -	for (i = 0; i < FREE_N_CHANNELS; i++) { -		bkvhead[i] = krwp->bkvhead_free[i]; -		krwp->bkvhead_free[i] = NULL; -	} +	for (i = 0; i < FREE_N_CHANNELS; i++) +		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);  	// Channel 3.  	head = krwp->head_free; @@ -3060,39 +2997,9 @@ static void kfree_rcu_work(struct work_struct *work)  	// Handle the first two channels.  	for (i = 0; i < FREE_N_CHANNELS; i++) { -		for (; bkvhead[i]; bkvhead[i] = bnext) { -			bnext = bkvhead[i]->next; -			debug_rcu_bhead_unqueue(bkvhead[i]); - -			rcu_lock_acquire(&rcu_callback_map); -			if (i == 0) { // kmalloc() / kfree(). -				trace_rcu_invoke_kfree_bulk_callback( -					rcu_state.name, bkvhead[i]->nr_records, -					bkvhead[i]->records); - -				kfree_bulk(bkvhead[i]->nr_records, -					bkvhead[i]->records); -			} else { // vmalloc() / vfree(). 
-				for (j = 0; j < bkvhead[i]->nr_records; j++) { -					trace_rcu_invoke_kvfree_callback( -						rcu_state.name, -						bkvhead[i]->records[j], 0); - -					vfree(bkvhead[i]->records[j]); -				} -			} -			rcu_lock_release(&rcu_callback_map); - -			raw_spin_lock_irqsave(&krcp->lock, flags); -			if (put_cached_bnode(krcp, bkvhead[i])) -				bkvhead[i] = NULL; -			raw_spin_unlock_irqrestore(&krcp->lock, flags); - -			if (bkvhead[i]) -				free_page((unsigned long) bkvhead[i]); - -			cond_resched_tasks_rcu_qs(); -		} +		// Start from the tail page, so a GP is likely passed for it. +		list_for_each_entry_safe(bnode, n, &bulk_head[i], list) +			kvfree_rcu_bulk(krcp, bnode, i);  	}  	/* @@ -3102,21 +3009,7 @@ static void kfree_rcu_work(struct work_struct *work)  	 * queued on a linked list through their rcu_head structures.  	 * This list is named "Channel 3".  	 */ -	for (; head; head = next) { -		unsigned long offset = (unsigned long)head->func; -		void *ptr = (void *)head - offset; - -		next = head->next; -		debug_rcu_head_unqueue((struct rcu_head *)ptr); -		rcu_lock_acquire(&rcu_callback_map); -		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset); - -		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) -			kvfree(ptr); - -		rcu_lock_release(&rcu_callback_map); -		cond_resched_tasks_rcu_qs(); -	} +	kvfree_rcu_list(head);  }  static bool @@ -3125,10 +3018,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)  	int i;  	for (i = 0; i < FREE_N_CHANNELS; i++) -		if (krcp->bkvhead[i]) +		if (!list_empty(&krcp->bulk_head[i]))  			return true; -	return !!krcp->head; +	return !!READ_ONCE(krcp->head); +} + +static int krc_count(struct kfree_rcu_cpu *krcp) +{ +	int sum = atomic_read(&krcp->head_count); +	int i; + +	for (i = 0; i < FREE_N_CHANNELS; i++) +		sum += atomic_read(&krcp->bulk_count[i]); + +	return sum;  }  static void @@ -3136,7 +3040,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)  {  	long delay, delay_left; -	delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; +	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;  	if (delayed_work_pending(&krcp->monitor_work)) {  		delay_left = krcp->monitor_work.timer.expires - jiffies;  		if (delay < delay_left) @@ -3146,6 +3050,44 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)  	queue_delayed_work(system_wq, &krcp->monitor_work, delay);  } +static void +kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) +{ +	struct list_head bulk_ready[FREE_N_CHANNELS]; +	struct kvfree_rcu_bulk_data *bnode, *n; +	struct rcu_head *head_ready = NULL; +	unsigned long flags; +	int i; + +	raw_spin_lock_irqsave(&krcp->lock, flags); +	for (i = 0; i < FREE_N_CHANNELS; i++) { +		INIT_LIST_HEAD(&bulk_ready[i]); + +		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) { +			if (!poll_state_synchronize_rcu(bnode->gp_snap)) +				break; + +			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]); +			list_move(&bnode->list, &bulk_ready[i]); +		} +	} + +	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) { +		head_ready = krcp->head; +		atomic_set(&krcp->head_count, 0); +		WRITE_ONCE(krcp->head, NULL); +	} +	raw_spin_unlock_irqrestore(&krcp->lock, flags); + +	for (i = 0; i < FREE_N_CHANNELS; i++) { +		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list) +			kvfree_rcu_bulk(krcp, bnode, i); +	} + +	if (head_ready) +		kvfree_rcu_list(head_ready); +} +  /*   * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.   
*/ @@ -3156,26 +3098,31 @@ static void kfree_rcu_monitor(struct work_struct *work)  	unsigned long flags;  	int i, j; +	// Drain ready for reclaim. +	kvfree_rcu_drain_ready(krcp); +  	raw_spin_lock_irqsave(&krcp->lock, flags);  	// Attempt to start a new batch.  	for (i = 0; i < KFREE_N_BATCHES; i++) {  		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]); -		// Try to detach bkvhead or head and attach it over any +		// Try to detach bulk_head or head and attach it over any  		// available corresponding free channel. It can be that  		// a previous RCU batch is in progress, it means that  		// immediately to queue another one is not possible so  		// in that case the monitor work is rearmed. -		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) || -			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) || -				(krcp->head && !krwp->head_free)) { +		if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) || +			(!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) || +				(READ_ONCE(krcp->head) && !krwp->head_free)) { +  			// Channel 1 corresponds to the SLAB-pointer bulk path.  			// Channel 2 corresponds to vmalloc-pointer bulk path.  			for (j = 0; j < FREE_N_CHANNELS; j++) { -				if (!krwp->bkvhead_free[j]) { -					krwp->bkvhead_free[j] = krcp->bkvhead[j]; -					krcp->bkvhead[j] = NULL; +				if (list_empty(&krwp->bulk_head_free[j])) { +					atomic_set(&krcp->bulk_count[j], 0); +					list_replace_init(&krcp->bulk_head[j], +						&krwp->bulk_head_free[j]);  				}  			} @@ -3183,11 +3130,10 @@ static void kfree_rcu_monitor(struct work_struct *work)  			// objects queued on the linked list.  			if (!krwp->head_free) {  				krwp->head_free = krcp->head; -				krcp->head = NULL; +				atomic_set(&krcp->head_count, 0); +				WRITE_ONCE(krcp->head, NULL);  			} -			WRITE_ONCE(krcp->count, 0); -  			// One work is per one batch, so there are three  			// "free channels", the batch can handle. It can  			// be that the work is in the pending state when @@ -3197,6 +3143,8 @@ static void kfree_rcu_monitor(struct work_struct *work)  		}  	} +	raw_spin_unlock_irqrestore(&krcp->lock, flags); +  	// If there is nothing to detach, it means that our job is  	// successfully done here. In case of having at least one  	// of the channels that is still busy we should rearm the @@ -3204,8 +3152,6 @@ static void kfree_rcu_monitor(struct work_struct *work)  	// still in progress.  	if (need_offload_krc(krcp))  		schedule_delayed_monitor_work(krcp); - -	raw_spin_unlock_irqrestore(&krcp->lock, flags);  }  static enum hrtimer_restart @@ -3288,10 +3234,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,  		return false;  	idx = !!is_vmalloc_addr(ptr); +	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx], +		struct kvfree_rcu_bulk_data, list);  	/* Check if a new block is required. */ -	if (!(*krcp)->bkvhead[idx] || -			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { +	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {  		bnode = get_cached_bnode(*krcp);  		if (!bnode && can_alloc) {  			krc_this_cpu_unlock(*krcp, *flags); @@ -3315,17 +3262,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,  		if (!bnode)  			return false; -		/* Initialize the new block. */ +		// Initialize the new block and attach it.  		bnode->nr_records = 0; -		bnode->next = (*krcp)->bkvhead[idx]; - -		/* Attach it to the head. */ -		(*krcp)->bkvhead[idx] = bnode; +		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);  	} -	/* Finally insert. 
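kfree_rcu_monitor() above now hands a whole per-CPU ->bulk_head list to a batch's free channel with list_replace_init(), an O(1) splice rather than a per-node transfer. The sketch below reimplements just enough of a circular doubly-linked list to show that detach; it is a simplified stand-in, not the kernel's <linux/list.h>.

#include <stdio.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *h) { h->prev = h->next = h; }

static int list_empty(const struct node *h) { return h->next == h; }

static void list_add(struct node *new, struct node *h)   /* add at head */
{
	new->next = h->next;
	new->prev = h;
	h->next->prev = new;
	h->next = new;
}

/* Move everything from @old onto @new in O(1) and reinitialise @old. */
static void list_replace_init(struct node *old, struct node *new)
{
	if (list_empty(old)) {
		list_init(new);
		return;
	}
	new->next = old->next;
	new->prev = old->prev;
	new->next->prev = new;
	new->prev->next = new;
	list_init(old);
}

int main(void)
{
	struct node head, chan, a, b;

	list_init(&head);
	list_add(&a, &head);
	list_add(&b, &head);

	list_replace_init(&head, &chan);           /* O(1) detach */
	printf("per-CPU list empty: %d, channel empty: %d\n",
	       list_empty(&head), list_empty(&chan));   /* 1, 0 */
	return 0;
}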
*/ -	(*krcp)->bkvhead[idx]->records -		[(*krcp)->bkvhead[idx]->nr_records++] = ptr; +	// Finally insert and update the GP for this page. +	bnode->records[bnode->nr_records++] = ptr; +	bnode->gp_snap = get_state_synchronize_rcu(); +	atomic_inc(&(*krcp)->bulk_count[idx]);  	return true;  } @@ -3342,26 +3287,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,   * be free'd in workqueue context. This allows us to: batch requests together to   * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.   */ -void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +void kvfree_call_rcu(struct rcu_head *head, void *ptr)  {  	unsigned long flags;  	struct kfree_rcu_cpu *krcp;  	bool success; -	void *ptr; -	if (head) { -		ptr = (void *) head - (unsigned long) func; -	} else { -		/* -		 * Please note there is a limitation for the head-less -		 * variant, that is why there is a clear rule for such -		 * objects: it can be used from might_sleep() context -		 * only. For other places please embed an rcu_head to -		 * your data. -		 */ +	/* +	 * Please note there is a limitation for the head-less +	 * variant, that is why there is a clear rule for such +	 * objects: it can be used from might_sleep() context +	 * only. For other places please embed an rcu_head to +	 * your data. +	 */ +	if (!head)  		might_sleep(); -		ptr = (unsigned long *) func; -	}  	// Queue the object but don't yet schedule the batch.  	if (debug_rcu_head_queue(ptr)) { @@ -3382,14 +3322,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)  			// Inline if kvfree_rcu(one_arg) call.  			goto unlock_return; -		head->func = func; +		head->func = ptr;  		head->next = krcp->head; -		krcp->head = head; +		WRITE_ONCE(krcp->head, head); +		atomic_inc(&krcp->head_count); + +		// Take a snapshot for this krcp. +		krcp->head_gp_snap = get_state_synchronize_rcu();  		success = true;  	} -	WRITE_ONCE(krcp->count, krcp->count + 1); -  	// Set timer to drain after KFREE_DRAIN_JIFFIES.  	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)  		schedule_delayed_monitor_work(krcp); @@ -3420,7 +3362,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)  	for_each_possible_cpu(cpu) {  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); -		count += READ_ONCE(krcp->count); +		count += krc_count(krcp);  		count += READ_ONCE(krcp->nr_bkv_objs);  		atomic_set(&krcp->backoff_page_cache_fill, 1);  	} @@ -3437,7 +3379,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)  		int count;  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); -		count = krcp->count; +		count = krc_count(krcp);  		count += drain_page_cache(krcp);  		kfree_rcu_monitor(&krcp->monitor_work.work); @@ -3461,15 +3403,12 @@ static struct shrinker kfree_rcu_shrinker = {  void __init kfree_rcu_scheduler_running(void)  {  	int cpu; -	unsigned long flags;  	for_each_possible_cpu(cpu) {  		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); -		raw_spin_lock_irqsave(&krcp->lock, flags);  		if (need_offload_krc(krcp))  			schedule_delayed_monitor_work(krcp); -		raw_spin_unlock_irqrestore(&krcp->lock, flags);  	}  } @@ -3485,9 +3424,10 @@ void __init kfree_rcu_scheduler_running(void)   */  static int rcu_blocking_is_gp(void)  { -	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) +	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) { +		might_sleep();  		return false; -	might_sleep();  /* Check for RCU read-side critical section. 
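With the kvfree_call_rcu() change above, the embedded head's ->func field stores the base pointer of the object itself, and kvfree_rcu_list() recovers the head's offset by subtracting that pointer from the head address. A small user-space sketch of that round trip; the structure names are made up.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct fake_rcu_head {
	struct fake_rcu_head *next;
	void *func;                      /* holds the base pointer here */
};

struct blob {
	int payload[4];
	struct fake_rcu_head rh;         /* embedded, like rcu_head */
};

int main(void)
{
	struct blob *b = malloc(sizeof(*b));
	struct fake_rcu_head *head;

	if (!b)
		return 1;
	head = &b->rh;

	head->func = b;                          /* queue side */

	void *ptr = head->func;                  /* reclaim side */
	size_t offset = (char *)head - (char *)ptr;

	printf("offset of head inside object: %zu (expected %zu)\n",
	       offset, offsetof(struct blob, rh));
	free(ptr);
	return 0;
}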
*/ +	}  	return true;  } @@ -3711,7 +3651,9 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);   * If @false is returned, it is the caller's responsibility to invoke this   * function later on until it does return @true.  Alternatively, the caller   * can explicitly wait for a grace period, for example, by passing @oldstate - * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited() + * on the one hand or by directly invoking either synchronize_rcu() or + * synchronize_rcu_expedited() on the other.   *   * Yes, this function does not take counter wrap into account.   * But counter wrap is harmless.  If the counter wraps, we have waited for @@ -3722,6 +3664,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);   * completed.  Alternatively, they can use get_completed_synchronize_rcu()   * to get a guaranteed-completed grace-period state.   * + * In addition, because oldstate compresses the grace-period state for + * both normal and expedited grace periods into a single unsigned long, + * it can miss a grace period when synchronize_rcu() runs concurrently + * with synchronize_rcu_expedited().  If this is unacceptable, please + * instead use the _full() variant of these polling APIs. + *   * This function provides the same memory-ordering guarantees that   * would be provided by a synchronize_rcu() that was invoked at the call   * to the function that provided @oldstate, and that returned at the end @@ -4080,6 +4028,155 @@ retry:  EXPORT_SYMBOL_GPL(rcu_barrier);  /* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ +	return READ_ONCE(rnp->qsmaskinitnext); +} + +/* + * Is the CPU corresponding to the specified rcu_data structure online + * from RCU's perspective?  This perspective is given by that structure's + * ->qsmaskinitnext field rather than by the global cpu_online_mask. + */ +static bool rcu_rdp_cpu_online(struct rcu_data *rdp) +{ +	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode)); +} + +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) + +/* + * Is the current CPU online as far as RCU is concerned? + * + * Disable preemption to avoid false positives that could otherwise + * happen due to the current CPU number being sampled, this task being + * preempted, its old CPU being taken offline, resuming on some other CPU, + * then determining that its old CPU is now offline. + * + * Disable checking if in an NMI handler because we cannot safely + * report errors from NMI handlers anyway.  In addition, it is OK to use + * RCU on an offline processor during initial boot, hence the check for + * rcu_scheduler_fully_active. + */ +bool rcu_lockdep_current_cpu_online(void) +{ +	struct rcu_data *rdp; +	bool ret = false; + +	if (in_nmi() || !rcu_scheduler_fully_active) +		return true; +	preempt_disable_notrace(); +	rdp = this_cpu_ptr(&rcu_data); +	/* +	 * Strictly, we care here about the case where the current CPU is +	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask +	 * not being up to date. So arch_spin_is_locked() might have a +	 * false positive if it's held by some *other* CPU, but that's +	 * OK because that just means a false *negative* on the warning. 
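The expanded comment above describes the polled grace-period pattern: take a cookie now and only wait later if the grace period has not completed on its own. The toy model below uses one bare sequence counter, which also illustrates the documented caveat: a single compressed cookie cannot tell which flavour of grace period advanced it, which is why the _full() variants exist. All names here are stand-ins for get_state_synchronize_rcu() and friends.

#include <stdio.h>

static unsigned long gp_seq;                 /* completed grace periods */

static unsigned long get_state(void)         /* cookie for a future GP */
{
	return gp_seq + 1;
}

static int poll_state(unsigned long cookie)  /* non-blocking check */
{
	return (long)(gp_seq - cookie) >= 0;
}

static void synchronize(void)                /* pretend to wait for a GP */
{
	gp_seq++;
}

int main(void)
{
	unsigned long cookie = get_state();

	printf("ready early? %d\n", poll_state(cookie));   /* 0: not yet */

	if (!poll_state(cookie))
		synchronize();           /* cond_synchronize-style fallback */

	printf("ready now?   %d\n", poll_state(cookie));   /* 1 */
	return 0;
}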
+	 */ +	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock)) +		ret = true; +	preempt_enable_notrace(); +	return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ + +// Has rcu_init() been invoked?  This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ +	return !!rcu_state.n_online_cpus; +} + +/* + * Near the end of the offline process.  Trace the fact that this CPU + * is going offline. + */ +int rcutree_dying_cpu(unsigned int cpu) +{ +	bool blkd; +	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); +	struct rcu_node *rnp = rdp->mynode; + +	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) +		return 0; + +	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); +	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), +			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); +	return 0; +} + +/* + * All CPUs for the specified rcu_node structure have gone offline, + * and all tasks that were preempted within an RCU read-side critical + * section while running on one of those CPUs have since exited their RCU + * read-side critical section.  Some other CPU is reporting this fact with + * the specified rcu_node structure's ->lock held and interrupts disabled. + * This function therefore goes up the tree of rcu_node structures, + * clearing the corresponding bits in the ->qsmaskinit fields.  Note that + * the leaf rcu_node structure's ->qsmaskinit field has already been + * updated. + * + * This function does check that the specified rcu_node structure has + * all CPUs offline and no blocked tasks, so it is OK to invoke it + * prematurely.  That said, invoking it after the fact will cost you + * a needless lock acquisition.  So once it has done its work, don't + * invoke it again. + */ +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) +{ +	long mask; +	struct rcu_node *rnp = rnp_leaf; + +	raw_lockdep_assert_held_rcu_node(rnp_leaf); +	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || +	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) || +	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) +		return; +	for (;;) { +		mask = rnp->grpmask; +		rnp = rnp->parent; +		if (!rnp) +			break; +		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ +		rnp->qsmaskinit &= ~mask; +		/* Between grace periods, so better already be zero! */ +		WARN_ON_ONCE(rnp->qsmask); +		if (rnp->qsmaskinit) { +			raw_spin_unlock_rcu_node(rnp); +			/* irqs remain disabled. */ +			return; +		} +		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ +	} +} + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context.  Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ +	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) +		return 0; + +	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); +	// Stop-machine done, so allow nohz_full to disable tick. +	tick_dep_clear(TICK_DEP_BIT_RCU); +	return 0; +} + +/*   * Propagate ->qsinitmask bits up the rcu_node tree to account for the   * first CPU in a given leaf rcu_node structure coming online.  
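rcu_cleanup_dead_rnp() above clears the leaf's bit in its parent's ->qsmaskinit and keeps climbing only while the parent has become completely empty. The sketch below models that walk with a three-node toy tree; indices, masks and field names are illustrative only.

#include <stdio.h>

struct tnode {
	int parent;               /* index of parent, -1 for the root    */
	unsigned long grpmask;    /* bit this node owns in the parent    */
	unsigned long qsmaskinit; /* bits of still-present children/CPUs */
};

static struct tnode tree[] = {
	[0] = { .parent = -1, .grpmask = 0,   .qsmaskinit = 0x3 }, /* root   */
	[1] = { .parent = 0,  .grpmask = 0x1, .qsmaskinit = 0x0 }, /* leaf A */
	[2] = { .parent = 0,  .grpmask = 0x2, .qsmaskinit = 0x4 }, /* leaf B */
};

static void cleanup_dead(int leaf)
{
	for (int i = leaf; tree[i].parent >= 0; i = tree[i].parent) {
		struct tnode *p = &tree[tree[i].parent];

		p->qsmaskinit &= ~tree[i].grpmask;
		if (p->qsmaskinit)      /* someone else still present: stop */
			return;
	}
}

int main(void)
{
	cleanup_dead(1);    /* leaf A went fully offline */
	printf("root qsmaskinit now: %#lx\n", tree[0].qsmaskinit); /* 0x2 */
	return 0;
}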
The caller   * must hold the corresponding leaf rcu_node ->lock with interrupts @@ -4408,11 +4505,13 @@ static int rcu_pm_notify(struct notifier_block *self,  	switch (action) {  	case PM_HIBERNATION_PREPARE:  	case PM_SUSPEND_PREPARE: +		rcu_async_hurry();  		rcu_expedite_gp();  		break;  	case PM_POST_HIBERNATION:  	case PM_POST_SUSPEND:  		rcu_unexpedite_gp(); +		rcu_async_relax();  		break;  	default:  		break; @@ -4766,7 +4865,7 @@ struct workqueue_struct *rcu_gp_wq;  static void __init kfree_rcu_batch_init(void)  {  	int cpu; -	int i; +	int i, j;  	/* Clamp it to [0:100] seconds interval. */  	if (rcu_delay_page_cache_fill_msec < 0 || @@ -4786,8 +4885,14 @@ static void __init kfree_rcu_batch_init(void)  		for (i = 0; i < KFREE_N_BATCHES; i++) {  			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);  			krcp->krw_arr[i].krcp = krcp; + +			for (j = 0; j < FREE_N_CHANNELS; j++) +				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);  		} +		for (i = 0; i < FREE_N_CHANNELS; i++) +			INIT_LIST_HEAD(&krcp->bulk_head[i]); +  		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);  		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);  		krcp->initialized = true; @@ -4838,6 +4943,8 @@ void __init rcu_init(void)  	// Kick-start any polled grace periods that started early.  	if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))  		(void)start_poll_synchronize_rcu_expedited(); + +	rcu_test_sync_prims();  }  #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fcb5d696eb17..192536916f9a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -158,6 +158,23 @@ union rcu_noqs {  	u16 s; /* Set of bits, aggregate OR here. */  }; +/* + * Record the snapshot of the core stats at half of the first RCU stall timeout. + * The member gp_seq is used to ensure that all members are updated only once + * during the sampling period. The snapshot is taken only if this gp_seq is not + * equal to rdp->gp_seq. + */ +struct rcu_snap_record { +	unsigned long	gp_seq;		/* Track rdp->gp_seq counter */ +	u64		cputime_irq;	/* Accumulated cputime of hard irqs */ +	u64		cputime_softirq;/* Accumulated cputime of soft irqs */ +	u64		cputime_system; /* Accumulated cputime of kernel tasks */ +	unsigned long	nr_hardirqs;	/* Accumulated number of hard irqs */ +	unsigned int	nr_softirqs;	/* Accumulated number of soft irqs */ +	unsigned long long nr_csw;	/* Accumulated number of task switches */ +	unsigned long   jiffies;	/* Track jiffies value */ +}; +  /* Per-CPU data for read-copy update. */  struct rcu_data {  	/* 1) quiescent-state and grace-period handling : */ @@ -262,6 +279,8 @@ struct rcu_data {  	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */  	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */  	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */ +	struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */ +					    /* the first RCU stall timeout */  	long lazy_len;			/* Length of buffered lazy callbacks. */  	int cpu; diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index ed6c3cce28f2..249c2967d9e6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -11,6 +11,7 @@  static void rcu_exp_handler(void *unused);  static int rcu_print_task_exp_stall(struct rcu_node *rnp); +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp);  /*   * Record the start of an expedited grace period. 
@@ -667,8 +668,11 @@ static void synchronize_rcu_expedited_wait(void)  				mask = leaf_node_cpu_bit(rnp, cpu);  				if (!(READ_ONCE(rnp->expmask) & mask))  					continue; +				preempt_disable(); // For smp_processor_id() in dump_cpu_task().  				dump_cpu_task(cpu); +				preempt_enable();  			} +			rcu_exp_print_detail_task_stall_rnp(rnp);  		}  		jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3;  		panic_on_rcu_stall(); @@ -811,6 +815,36 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)  	return ndetected;  } +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, dumping the stack of each that is blocking the current + * expedited grace period. + */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +	unsigned long flags; +	struct task_struct *t; + +	if (!rcu_exp_stall_task_details) +		return; +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	if (!READ_ONCE(rnp->exp_tasks)) { +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	t = list_entry(rnp->exp_tasks->prev, +		       struct task_struct, rcu_node_entry); +	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		/* +		 * We could be printing a lot while holding a spinlock. +		 * Avoid triggering hard lockup. +		 */ +		touch_nmi_watchdog(); +		sched_show_task(t); +	} +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} +  #else /* #ifdef CONFIG_PREEMPT_RCU */  /* Request an expedited quiescent state. */ @@ -883,6 +917,15 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)  	return 0;  } +/* + * Because preemptible RCU does not exist, we never have to print out + * tasks blocked within RCU read-side critical sections that are blocking + * the current expedited grace period. + */ +static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} +  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */  /** diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5653560573e2..b10b8349bb2a 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -39,7 +39,7 @@ int rcu_exp_jiffies_till_stall_check(void)  	// CONFIG_RCU_EXP_CPU_STALL_TIMEOUT, so check the allowed range.  	// The minimum clamped value is "2UL", because at least one full  	// tick has to be guaranteed. 
-	till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 21UL * HZ); +	till_stall_check = clamp(msecs_to_jiffies(cpu_stall_timeout), 2UL, 300UL * HZ);  	if (cpu_stall_timeout && jiffies_to_msecs(till_stall_check) != cpu_stall_timeout)  		WRITE_ONCE(rcu_exp_cpu_stall_timeout, jiffies_to_msecs(till_stall_check)); @@ -428,6 +428,35 @@ static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp  	return j > 2 * HZ;  } +static void print_cpu_stat_info(int cpu) +{ +	struct rcu_snap_record rsr, *rsrp; +	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); +	struct kernel_cpustat *kcsp = &kcpustat_cpu(cpu); + +	if (!rcu_cpu_stall_cputime) +		return; + +	rsrp = &rdp->snap_record; +	if (rsrp->gp_seq != rdp->gp_seq) +		return; + +	rsr.cputime_irq     = kcpustat_field(kcsp, CPUTIME_IRQ, cpu); +	rsr.cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu); +	rsr.cputime_system  = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu); + +	pr_err("\t         hardirqs   softirqs   csw/system\n"); +	pr_err("\t number: %8ld %10d %12lld\n", +		kstat_cpu_irqs_sum(cpu) - rsrp->nr_hardirqs, +		kstat_cpu_softirqs_sum(cpu) - rsrp->nr_softirqs, +		nr_context_switches_cpu(cpu) - rsrp->nr_csw); +	pr_err("\tcputime: %8lld %10lld %12lld   ==> %d(ms)\n", +		div_u64(rsr.cputime_irq - rsrp->cputime_irq, NSEC_PER_MSEC), +		div_u64(rsr.cputime_softirq - rsrp->cputime_softirq, NSEC_PER_MSEC), +		div_u64(rsr.cputime_system - rsrp->cputime_system, NSEC_PER_MSEC), +		jiffies_to_msecs(jiffies - rsrp->jiffies)); +} +  /*   * Print out diagnostic information for the specified stalled CPU.   * @@ -484,6 +513,8 @@ static void print_cpu_stall_info(int cpu)  	       data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,  	       rcuc_starved ? buf : "",  	       falsepositive ? " (false positive?)" : ""); + +	print_cpu_stat_info(cpu);  }  /* Complain about starvation of grace-period kthread.  */ @@ -588,7 +619,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)  	for_each_possible_cpu(cpu)  		totqlen += rcu_get_n_cbs_cpu(cpu); -	pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n", +	pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",  	       smp_processor_id(), (long)(jiffies - gps),  	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);  	if (ndetected) { @@ -649,7 +680,7 @@ static void print_cpu_stall(unsigned long gps)  	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);  	for_each_possible_cpu(cpu)  		totqlen += rcu_get_n_cbs_cpu(cpu); -	pr_cont("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n", +	pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",  		jiffies - gps,  		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index f5e6a2f95a2a..19bf6fa3ee6a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -144,8 +144,45 @@ bool rcu_gp_is_normal(void)  }  EXPORT_SYMBOL_GPL(rcu_gp_is_normal); -static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); +static atomic_t rcu_async_hurry_nesting = ATOMIC_INIT(1); +/* + * Should call_rcu() callbacks be processed with urgency or are + * they OK being executed with arbitrary delays? + */ +bool rcu_async_should_hurry(void) +{ +	return !IS_ENABLED(CONFIG_RCU_LAZY) || +	       atomic_read(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_should_hurry); + +/** + * rcu_async_hurry - Make future async RCU callbacks not lazy. 
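print_cpu_stall_info() now appends the hard-irq, soft-irq and system cputime accumulated since the snapshot taken at half of the stall timeout, converted from nanoseconds to milliseconds. A worked example of those deltas with invented counter values:

#include <stdio.h>
#include <inttypes.h>

#define NSEC_PER_MSEC 1000000ULL

struct snap {
	uint64_t cputime_irq, cputime_softirq, cputime_system;  /* nanoseconds */
	unsigned long nr_hardirqs;
	unsigned long long nr_csw;
};

int main(void)
{
	/* counters at the half-timeout snapshot and at stall-report time */
	struct snap old = { 40000000, 10000000, 200000000, 1200, 5500 };
	struct snap now = { 95000000, 12000000, 830000000, 1810, 5504 };

	printf("number:  hardirqs=+%lu csw=+%llu\n",
	       now.nr_hardirqs - old.nr_hardirqs, now.nr_csw - old.nr_csw);
	printf("cputime: irq=%" PRIu64 "ms softirq=%" PRIu64 "ms system=%" PRIu64 "ms\n",
	       (now.cputime_irq - old.cputime_irq) / NSEC_PER_MSEC,
	       (now.cputime_softirq - old.cputime_softirq) / NSEC_PER_MSEC,
	       (now.cputime_system - old.cputime_system) / NSEC_PER_MSEC);
	return 0;
}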
+ * + * After a call to this function, future calls to call_rcu() + * will be processed in a timely fashion. + */ +void rcu_async_hurry(void) +{ +	if (IS_ENABLED(CONFIG_RCU_LAZY)) +		atomic_inc(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_hurry); +/** + * rcu_async_relax - Make future async RCU callbacks lazy. + * + * After a call to this function, future calls to call_rcu() + * will be processed in a lazy fashion. + */ +void rcu_async_relax(void) +{ +	if (IS_ENABLED(CONFIG_RCU_LAZY)) +		atomic_dec(&rcu_async_hurry_nesting); +} +EXPORT_SYMBOL_GPL(rcu_async_relax); + +static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);  /*   * Should normal grace-period primitives be expedited?  Intended for   * use within RCU.  Note that this function takes the rcu_expedited @@ -195,6 +232,7 @@ static bool rcu_boot_ended __read_mostly;  void rcu_end_inkernel_boot(void)  {  	rcu_unexpedite_gp(); +	rcu_async_relax();  	if (rcu_normal_after_boot)  		WRITE_ONCE(rcu_normal, 1);  	rcu_boot_ended = true; @@ -220,6 +258,7 @@ void rcu_test_sync_prims(void)  {  	if (!IS_ENABLED(CONFIG_PROVE_RCU))  		return; +	pr_info("Running RCU synchronous self tests\n");  	synchronize_rcu();  	synchronize_rcu_expedited();  } @@ -508,6 +547,10 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;  module_param(rcu_cpu_stall_timeout, int, 0644);  int rcu_exp_cpu_stall_timeout __read_mostly = CONFIG_RCU_EXP_CPU_STALL_TIMEOUT;  module_param(rcu_exp_cpu_stall_timeout, int, 0644); +int rcu_cpu_stall_cputime __read_mostly = IS_ENABLED(CONFIG_RCU_CPU_STALL_CPUTIME); +module_param(rcu_cpu_stall_cputime, int, 0644); +bool rcu_exp_stall_task_details __read_mostly; +module_param(rcu_exp_stall_task_details, bool, 0644);  #endif /* #ifdef CONFIG_RCU_STALL_COMMON */  // Suppress boot-time RCU CPU stall warnings and rcutorture writer stall @@ -555,9 +598,12 @@ struct early_boot_kfree_rcu {  static void early_boot_test_call_rcu(void)  {  	static struct rcu_head head; +	int idx;  	static struct rcu_head shead;  	struct early_boot_kfree_rcu *rhp; +	idx = srcu_down_read(&early_srcu); +	srcu_up_read(&early_srcu, idx);  	call_rcu(&head, test_callback);  	early_srcu_cookie = start_poll_synchronize_srcu(&early_srcu);  	call_srcu(&early_srcu, &shead, test_callback); @@ -586,6 +632,7 @@ static int rcu_verify_early_boot_tests(void)  		early_boot_test_counter++;  		srcu_barrier(&early_srcu);  		WARN_ON_ONCE(!poll_state_synchronize_srcu(&early_srcu, early_srcu_cookie)); +		cleanup_srcu_struct(&early_srcu);  	}  	if (rcu_self_test_counter != early_boot_test_counter) {  		WARN_ON(1); diff --git a/kernel/relay.c b/kernel/relay.c index ef12532168d9..9aa70ae53d24 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -91,7 +91,7 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)  		return -EINVAL;  	vma->vm_ops = &relay_file_mmap_ops; -	vma->vm_flags |= VM_DONTEXPAND; +	vm_flags_set(vma, VM_DONTEXPAND);  	vma->vm_private_data = buf;  	return 0; diff --git a/kernel/resource.c b/kernel/resource.c index ddbbacb9fb50..b1763b2fd7ef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1343,20 +1343,6 @@ retry:  			continue;  		} -		/* -		 * All memory regions added from memory-hotplug path have the -		 * flag IORESOURCE_SYSTEM_RAM. If the resource does not have -		 * this flag, we know that we are dealing with a resource coming -		 * from HMM/devm. HMM/devm use another mechanism to add/release -		 * a resource. This goes via devm_request_mem_region and -		 * devm_release_mem_region. 
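rcu_async_hurry()/rcu_async_relax() above keep a nesting count of reasons to stay non-lazy, pre-initialized to 1 so that early boot itself counts as such a reason until rcu_end_inkernel_boot(). A minimal stand-alone model of that counter; it deliberately ignores the CONFIG_RCU_LAZY short-circuit.

#include <stdio.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int hurry_nesting = 1;     /* boot holds the first reference */

static void async_hurry(void) { atomic_fetch_add(&hurry_nesting, 1); }
static void async_relax(void) { atomic_fetch_sub(&hurry_nesting, 1); }
static bool async_should_hurry(void) { return atomic_load(&hurry_nesting) != 0; }

int main(void)
{
	printf("during boot: hurry=%d\n", async_should_hurry());   /* 1 */

	async_hurry();                  /* e.g. suspend/hibernate prepare */
	async_relax();                  /* matching post-suspend          */

	async_relax();                  /* rcu_end_inkernel_boot()        */
	printf("after boot:  hurry=%d\n", async_should_hurry());   /* 0 */
	return 0;
}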
-		 * HMM/devm take care to release their resources when they want, -		 * so if we are dealing with them, let us just back off here. -		 */ -		if (!(res->flags & IORESOURCE_SYSRAM)) { -			break; -		} -  		if (!(res->flags & IORESOURCE_MEM))  			break; diff --git a/kernel/rseq.c b/kernel/rseq.c index d38ab944105d..9de6e35fe679 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -18,6 +18,9 @@  #define CREATE_TRACE_POINTS  #include <trace/events/rseq.h> +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE		32 +  #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \  				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \  				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) @@ -82,15 +85,25 @@   *   F1. <failure>   */ -static int rseq_update_cpu_id(struct task_struct *t) +static int rseq_update_cpu_node_id(struct task_struct *t)  { -	u32 cpu_id = raw_smp_processor_id();  	struct rseq __user *rseq = t->rseq; +	u32 cpu_id = raw_smp_processor_id(); +	u32 node_id = cpu_to_node(cpu_id); +	u32 mm_cid = task_mm_cid(t); -	if (!user_write_access_begin(rseq, sizeof(*rseq))) +	WARN_ON_ONCE((int) mm_cid < 0); +	if (!user_write_access_begin(rseq, t->rseq_len))  		goto efault;  	unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);  	unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); +	unsafe_put_user(node_id, &rseq->node_id, efault_end); +	unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); +	/* +	 * Additional feature fields added after ORIG_RSEQ_SIZE +	 * need to be conditionally updated only if +	 * t->rseq_len != ORIG_RSEQ_SIZE. +	 */  	user_write_access_end();  	trace_rseq_update(t);  	return 0; @@ -101,9 +114,10 @@ efault:  	return -EFAULT;  } -static int rseq_reset_rseq_cpu_id(struct task_struct *t) +static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)  { -	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; +	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, +	    mm_cid = 0;  	/*  	 * Reset cpu_id_start to its initial state (0). @@ -117,6 +131,21 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t)  	 */  	if (put_user(cpu_id, &t->rseq->cpu_id))  		return -EFAULT; +	/* +	 * Reset node_id to its initial state (0). +	 */ +	if (put_user(node_id, &t->rseq->node_id)) +		return -EFAULT; +	/* +	 * Reset mm_cid to its initial state (0). +	 */ +	if (put_user(mm_cid, &t->rseq->mm_cid)) +		return -EFAULT; +	/* +	 * Additional feature fields added after ORIG_RSEQ_SIZE +	 * need to be conditionally reset only if +	 * t->rseq_len != ORIG_RSEQ_SIZE. +	 */  	return 0;  } @@ -301,7 +330,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)  		if (unlikely(ret < 0))  			goto error;  	} -	if (unlikely(rseq_update_cpu_id(t))) +	if (unlikely(rseq_update_cpu_node_id(t)))  		goto error;  	return; @@ -344,15 +373,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		/* Unregister rseq for current thread. */  		if (current->rseq != rseq || !current->rseq)  			return -EINVAL; -		if (rseq_len != sizeof(*rseq)) +		if (rseq_len != current->rseq_len)  			return -EINVAL;  		if (current->rseq_sig != sig)  			return -EPERM; -		ret = rseq_reset_rseq_cpu_id(current); +		ret = rseq_reset_rseq_cpu_node_id(current);  		if (ret)  			return ret;  		current->rseq = NULL;  		current->rseq_sig = 0; +		current->rseq_len = 0;  		return 0;  	} @@ -365,7 +395,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		 * the provided address differs from the prior  		 * one.  		 
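The rseq hunks above make the registration size extensible: ORIG_RSEQ_SIZE (32 bytes) keeps the original ABI, while larger registrations must be aligned to the structure's natural alignment and cover every supported field. The user-space sketch below encodes that rule as a predicate; ORIG_RSEQ_SIZE comes from the patch, but RSEQ_ALIGN and MIN_EXTENDED_SIZE are assumed placeholders for __alignof__(struct rseq) and offsetof(struct rseq, end).

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define ORIG_RSEQ_SIZE     32u   /* original struct rseq size, incl. padding */
#define RSEQ_ALIGN         32u   /* assumed natural alignment of the struct  */
#define MIN_EXTENDED_SIZE  40u   /* assumed offset just past the last field  */

static bool rseq_len_ok(uintptr_t addr, uint32_t len)
{
	if (len < ORIG_RSEQ_SIZE)
		return false;
	if (len == ORIG_RSEQ_SIZE)
		return (addr % ORIG_RSEQ_SIZE) == 0;
	return (addr % RSEQ_ALIGN) == 0 && len >= MIN_EXTENDED_SIZE;
}

int main(void)
{
	printf("%d\n", rseq_len_ok(0x1000, 32));   /* 1: original ABI    */
	printf("%d\n", rseq_len_ok(0x1010, 32));   /* 0: misaligned      */
	printf("%d\n", rseq_len_ok(0x1000, 20));   /* 0: too small       */
	printf("%d\n", rseq_len_ok(0x1000, 64));   /* 1: extended layout */
	return 0;
}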
*/ -		if (current->rseq != rseq || rseq_len != sizeof(*rseq)) +		if (current->rseq != rseq || rseq_len != current->rseq_len)  			return -EINVAL;  		if (current->rseq_sig != sig)  			return -EPERM; @@ -374,15 +404,24 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  	}  	/* -	 * If there was no rseq previously registered, -	 * ensure the provided rseq is properly aligned and valid. +	 * If there was no rseq previously registered, ensure the provided rseq +	 * is properly aligned, as communcated to user-space through the ELF +	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq +	 * size, the required alignment is the original struct rseq alignment. +	 * +	 * In order to be valid, rseq_len is either the original rseq size, or +	 * large enough to contain all supported fields, as communicated to +	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.  	 */ -	if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || -	    rseq_len != sizeof(*rseq)) +	if (rseq_len < ORIG_RSEQ_SIZE || +	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || +	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || +					    rseq_len < offsetof(struct rseq, end))))  		return -EINVAL;  	if (!access_ok(rseq, rseq_len))  		return -EFAULT;  	current->rseq = rseq; +	current->rseq_len = rseq_len;  	current->rseq_sig = sig;  	/*  	 * If rseq was previously inactive, and has just been diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e374c0c923da..5732fa75ebab 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -93,7 +93,7 @@ struct sched_clock_data {  static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -notrace static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void)  {  	return this_cpu_ptr(&sched_clock_data);  } @@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late);   * min, max except they take wrapping into account   */ -notrace static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y)  {  	return (s64)(x - y) < 0 ? x : y;  } -notrace static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y)  {  	return (s64)(x - y) > 0 ? 
x : y;  } @@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y)   *  - filter out backward motion   *  - use the GTOD tick value to create a window to filter crazy TSC values   */ -notrace static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)  {  	u64 now, clock, old_clock, min_clock, max_clock, gtod;  	s64 delta; @@ -287,13 +287,28 @@ again:  	clock = wrap_max(clock, min_clock);  	clock = wrap_min(clock, max_clock); -	if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) +	if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))  		goto again;  	return clock;  } -notrace static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock(void) +{ +	u64 clock; + +	if (static_branch_likely(&__sched_clock_stable)) +		return sched_clock() + __sched_clock_offset; + +	preempt_disable_notrace(); +	clock = sched_clock_local(this_scd()); +	preempt_enable_notrace(); + +	return clock; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd)  {  	struct sched_clock_data *my_scd = this_scd();  	u64 this_clock, remote_clock; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25b582b6ee5f..af017e038b48 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -152,7 +152,7 @@ __read_mostly int scheduler_running;  DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);  /* kernel prio, less is more */ -static inline int __task_prio(struct task_struct *p) +static inline int __task_prio(const struct task_struct *p)  {  	if (p->sched_class == &stop_sched_class) /* trumps deadline */  		return -2; @@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p)   */  /* real prio, less is less */ -static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +static inline bool prio_less(const struct task_struct *a, +			     const struct task_struct *b, bool in_fi)  {  	int pa = __task_prio(a), pb = __task_prio(b); @@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool  	return false;  } -static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +static inline bool __sched_core_less(const struct task_struct *a, +				     const struct task_struct *b)  {  	if (a->core_cookie < b->core_cookie)  		return true; @@ -2604,27 +2606,71 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)  		.user_mask = NULL,  		.flags     = SCA_USER,	/* clear the user requested mask */  	}; +	union cpumask_rcuhead { +		cpumask_t cpumask; +		struct rcu_head rcu; +	};  	__do_set_cpus_allowed(p, &ac); -	kfree(ac.user_mask); + +	/* +	 * Because this is called with p->pi_lock held, it is not possible +	 * to use kfree() here (when PREEMPT_RT=y), therefore punt to using +	 * kfree_rcu(). +	 */ +	kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu); +} + +static cpumask_t *alloc_user_cpus_ptr(int node) +{ +	/* +	 * See do_set_cpus_allowed() above for the rcu_head usage. +	 */ +	int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); + +	return kmalloc_node(size, GFP_KERNEL, node);  }  int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,  		      int node)  { +	cpumask_t *user_mask;  	unsigned long flags; -	if (!src->user_cpus_ptr) +	/* +	 * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's +	 * may differ by now due to racing. 
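The wrap_min()/wrap_max() helpers shown above compare clock values through the sign of their 64-bit difference, so the ordering survives counter wrap. A short demonstration with values on both sides of the wrap point:

#include <stdio.h>
#include <stdint.h>

static uint64_t wrap_min(uint64_t x, uint64_t y)
{
	return (int64_t)(x - y) < 0 ? x : y;
}

static uint64_t wrap_max(uint64_t x, uint64_t y)
{
	return (int64_t)(x - y) > 0 ? x : y;
}

int main(void)
{
	uint64_t before_wrap = UINT64_MAX - 5;   /* "older" value  */
	uint64_t after_wrap  = 10;               /* wrapped, newer */

	/* A plain '<' would call 10 the smaller one; the wrap-aware helpers
	 * treat the post-wrap value as the more recent clock reading. */
	printf("wrap_max picks the wrapped value:  %d\n",
	       wrap_max(after_wrap, before_wrap) == after_wrap);    /* 1 */
	printf("wrap_min picks the pre-wrap value: %d\n",
	       wrap_min(after_wrap, before_wrap) == before_wrap);   /* 1 */
	return 0;
}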
+	 */ +	dst->user_cpus_ptr = NULL; + +	/* +	 * This check is racy and losing the race is a valid situation. +	 * It is not worth the extra overhead of taking the pi_lock on +	 * every fork/clone. +	 */ +	if (data_race(!src->user_cpus_ptr))  		return 0; -	dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node); -	if (!dst->user_cpus_ptr) +	user_mask = alloc_user_cpus_ptr(node); +	if (!user_mask)  		return -ENOMEM; -	/* Use pi_lock to protect content of user_cpus_ptr */ +	/* +	 * Use pi_lock to protect content of user_cpus_ptr +	 * +	 * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent +	 * do_set_cpus_allowed(). +	 */  	raw_spin_lock_irqsave(&src->pi_lock, flags); -	cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); +	if (src->user_cpus_ptr) { +		swap(dst->user_cpus_ptr, user_mask); +		cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); +	}  	raw_spin_unlock_irqrestore(&src->pi_lock, flags); + +	if (unlikely(user_mask)) +		kfree(user_mask); +  	return 0;  } @@ -2907,8 +2953,11 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,  	}  	if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { -		if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) +		if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) { +			if (ctx->flags & SCA_USER) +				swap(p->user_cpus_ptr, ctx->user_mask);  			goto out; +		}  		if (WARN_ON_ONCE(p == current &&  				 is_migration_disabled(p) && @@ -3581,6 +3630,11 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)  	return false;  } +static inline cpumask_t *alloc_user_cpus_ptr(int node) +{ +	return NULL; +} +  #endif /* !CONFIG_SMP */  static void @@ -3623,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)  }  /* - * Mark the task runnable and perform wakeup-preemption. + * Mark the task runnable.   
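The reworked dup_user_cpus_ptr() above allocates the copy buffer before taking pi_lock, re-checks the source pointer under the lock (it may have been cleared concurrently by do_set_cpus_allowed()), and frees the buffer if the race was lost. The sketch below models that pattern with a single pthread mutex and a plain string in place of a cpumask; build with -pthread.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static char *src_mask;                     /* may be reset to NULL elsewhere */

static int dup_mask(char **dst_mask, size_t size)
{
	char *buf;

	*dst_mask = NULL;
	if (!src_mask)                     /* racy early-out, like data_race() */
		return 0;

	buf = malloc(size);                /* allocate outside the lock */
	if (!buf)
		return -1;

	pthread_mutex_lock(&pi_lock);
	if (src_mask) {                    /* re-check under the lock */
		*dst_mask = buf;
		memcpy(*dst_mask, src_mask, size);
		buf = NULL;                /* ownership transferred */
	}
	pthread_mutex_unlock(&pi_lock);

	free(buf);                    /* leftover only if we lost the race */
	return 0;
}

int main(void)
{
	char source[] = "0-3";
	char *dst = NULL;

	src_mask = source;
	if (dup_mask(&dst, sizeof(source)))
		return 1;
	printf("copied mask: %s\n", dst ? dst : "(none)");
	free(dst);
	return 0;
}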
*/ -static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, -			   struct rq_flags *rf) +static inline void ttwu_do_wakeup(struct task_struct *p)  { -	check_preempt_curr(rq, p, wake_flags);  	WRITE_ONCE(p->__state, TASK_RUNNING);  	trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, +		 struct rq_flags *rf) +{ +	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + +	lockdep_assert_rq_held(rq); + +	if (p->sched_contributes_to_load) +		rq->nr_uninterruptible--; + +#ifdef CONFIG_SMP +	if (wake_flags & WF_MIGRATED) +		en_flags |= ENQUEUE_MIGRATED; +	else +#endif +	if (p->in_iowait) { +		delayacct_blkio_end(p); +		atomic_dec(&task_rq(p)->nr_iowait); +	} + +	activate_task(rq, p, en_flags); +	check_preempt_curr(rq, p, wake_flags); + +	ttwu_do_wakeup(p);  #ifdef CONFIG_SMP  	if (p->sched_class->task_woken) { @@ -3660,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,  #endif  } -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, -		 struct rq_flags *rf) -{ -	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - -	lockdep_assert_rq_held(rq); - -	if (p->sched_contributes_to_load) -		rq->nr_uninterruptible--; - -#ifdef CONFIG_SMP -	if (wake_flags & WF_MIGRATED) -		en_flags |= ENQUEUE_MIGRATED; -	else -#endif -	if (p->in_iowait) { -		delayacct_blkio_end(p); -		atomic_dec(&task_rq(p)->nr_iowait); -	} - -	activate_task(rq, p, en_flags); -	ttwu_do_wakeup(rq, p, wake_flags, rf); -} -  /*   * Consider @p being inside a wait loop:   * @@ -3718,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)  	rq = __task_rq_lock(p, &rf);  	if (task_on_rq_queued(p)) { -		/* check_preempt_curr() may use rq clock */ -		update_rq_clock(rq); -		ttwu_do_wakeup(rq, p, wake_flags, &rf); +		if (!task_on_cpu(rq, p)) { +			/* +			 * When on_rq && !on_cpu the task is preempted, see if +			 * it should preempt the task that is current now. 
+			 */ +			update_rq_clock(rq); +			check_preempt_curr(rq, p, wake_flags); +		} +		ttwu_do_wakeup(p);  		ret = 1;  	}  	__task_rq_unlock(rq, &rf); @@ -4086,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  			goto out;  		trace_sched_waking(p); -		WRITE_ONCE(p->__state, TASK_RUNNING); -		trace_sched_wakeup(p); +		ttwu_do_wakeup(p);  		goto out;  	} @@ -5052,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,  	sched_info_switch(rq, prev, next);  	perf_event_task_sched_out(prev, next);  	rseq_preempt(prev); +	switch_mm_cid(prev, next);  	fire_sched_out_preempt_notifiers(prev, next);  	kmap_local_sched_out();  	prepare_task(next); @@ -5282,6 +5342,11 @@ bool single_task_running(void)  }  EXPORT_SYMBOL(single_task_running); +unsigned long long nr_context_switches_cpu(int cpu) +{ +	return cpu_rq(cpu)->nr_switches; +} +  unsigned long long nr_context_switches(void)  {  	int i; @@ -5504,7 +5569,9 @@ void scheduler_tick(void)  	unsigned long thermal_pressure;  	u64 resched_latency; -	arch_scale_freq_tick(); +	if (housekeeping_cpu(cpu, HK_TYPE_TICK)) +		arch_scale_freq_tick(); +  	sched_clock_tick();  	rq_lock(rq, &rf); @@ -6206,7 +6273,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)  {  	int i; -	for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { +	for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {  		if (i == cpu)  			continue; @@ -8239,12 +8306,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	if (retval)  		goto out_put_task; -	user_mask = kmalloc(cpumask_size(), GFP_KERNEL); -	if (!user_mask) { +	/* +	 * With non-SMP configs, user_cpus_ptr/user_mask isn't used and +	 * alloc_user_cpus_ptr() returns NULL. +	 */ +	user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); +	if (user_mask) { +		cpumask_copy(user_mask, in_mask); +	} else if (IS_ENABLED(CONFIG_SMP)) {  		retval = -ENOMEM;  		goto out_put_task;  	} -	cpumask_copy(user_mask, in_mask); +  	ac = (struct affinity_context){  		.new_mask  = in_mask,  		.user_mask = user_mask, @@ -11305,3 +11378,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)  {          trace_sched_update_nr_running_tp(rq, count);  } + +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_exit_signals(struct task_struct *t) +{ +	struct mm_struct *mm = t->mm; +	unsigned long flags; + +	if (!mm) +		return; +	local_irq_save(flags); +	mm_cid_put(mm, t->mm_cid); +	t->mm_cid = -1; +	t->mm_cid_active = 0; +	local_irq_restore(flags); +} + +void sched_mm_cid_before_execve(struct task_struct *t) +{ +	struct mm_struct *mm = t->mm; +	unsigned long flags; + +	if (!mm) +		return; +	local_irq_save(flags); +	mm_cid_put(mm, t->mm_cid); +	t->mm_cid = -1; +	t->mm_cid_active = 0; +	local_irq_restore(flags); +} + +void sched_mm_cid_after_execve(struct task_struct *t) +{ +	struct mm_struct *mm = t->mm; +	unsigned long flags; + +	if (!mm) +		return; +	local_irq_save(flags); +	t->mm_cid = mm_cid_get(mm); +	t->mm_cid_active = 1; +	local_irq_restore(flags); +	rseq_set_notify_resume(t); +} + +void sched_mm_cid_fork(struct task_struct *t) +{ +	WARN_ON_ONCE(!t->mm || t->mm_cid != -1); +	t->mm_cid_active = 1; +} +#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1207c78f85c1..e3211455b203 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,7 +48,6 @@ struct sugov_cpu {  	unsigned long		util;  	unsigned long		bw_dl; -	unsigned long		max;  	/* The field below is for single-CPU policies only: */  #ifdef 
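Several scans above (steal_cookie_task() here, with similar hunks in fair.c) now start their wrapped CPU iteration at cpu + 1, so the CPU that triggered the scan is visited last instead of first. A tiny model of the resulting visit order, assuming four CPUs:

#include <stdio.h>

#define NR_CPUS 4

static void scan_from(int start)
{
	printf("start=%d:", start);
	for (int i = 0; i < NR_CPUS; i++)
		printf(" %d", (start + i) % NR_CPUS);
	printf("\n");
}

int main(void)
{
	int cpu = 2;

	scan_from(cpu);        /* old order: 2 3 0 1  (rechecks cpu first)  */
	scan_from(cpu + 1);    /* new order: 3 0 1 2  (cpu considered last) */
	return 0;
}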
CONFIG_NO_HZ_COMMON @@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)  {  	struct rq *rq = cpu_rq(sg_cpu->cpu); -	sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);  	sg_cpu->bw_dl = cpu_bw_dl(rq);  	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),  					  FREQUENCY_UTIL, NULL); @@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,   * sugov_iowait_apply() - Apply the IO boost to a CPU.   * @sg_cpu: the sugov data for the cpu to boost   * @time: the update time from the caller + * @max_cap: the max CPU capacity   *   * A CPU running a task which woken up after an IO operation can have its   * utilization boosted to speed up the completion of those IO operations. @@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,   * This mechanism is designed to boost high frequently IO waiting tasks, while   * being more conservative on tasks which does sporadic IO operations.   */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, +			       unsigned long max_cap)  {  	unsigned long boost; @@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)  	 * sg_cpu->util is already in capacity scale; convert iowait_boost  	 * into the same scale so we can compare.  	 */ -	boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; +	boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;  	boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);  	if (sg_cpu->util < boost)  		sg_cpu->util = boost; @@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)  }  static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, -					      u64 time, unsigned int flags) +					      u64 time, unsigned long max_cap, +					      unsigned int flags)  {  	sugov_iowait_boost(sg_cpu, time, flags);  	sg_cpu->last_update = time; @@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,  		return false;  	sugov_get_util(sg_cpu); -	sugov_iowait_apply(sg_cpu, time); +	sugov_iowait_apply(sg_cpu, time, max_cap);  	return true;  } @@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,  	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);  	struct sugov_policy *sg_policy = sg_cpu->sg_policy;  	unsigned int cached_freq = sg_policy->cached_raw_freq; +	unsigned long max_cap;  	unsigned int next_f; -	if (!sugov_update_single_common(sg_cpu, time, flags)) +	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + +	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))  		return; -	next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); +	next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);  	/*  	 * Do not reduce the frequency if the CPU has not been idle  	 * recently, as the reduction is likely to be premature then. 
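sugov_iowait_apply() above now receives max_cap from its caller and converts the [0, SCHED_CAPACITY_SCALE] iowait boost into that capacity scale with a multiply-and-shift before comparing it with the utilization. A worked example with an assumed CPU capacity of 768:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)   /* 1024 */

int main(void)
{
	unsigned long iowait_boost = SCHED_CAPACITY_SCALE / 2;  /* 512 */
	unsigned long max_cap = 768;           /* arch_scale_cpu_capacity() */
	unsigned long util = 300;

	unsigned long boost = (iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;

	if (util < boost)
		util = boost;

	/* 512/1024 of a 768-capacity CPU is 384, which lifts util from 300. */
	printf("boost=%lu util=%lu\n", boost, util);
	return 0;
}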
@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,  {  	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);  	unsigned long prev_util = sg_cpu->util; +	unsigned long max_cap;  	/*  	 * Fall back to the "frequency" path if frequency invariance is not @@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,  		return;  	} -	if (!sugov_update_single_common(sg_cpu, time, flags)) +	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + +	if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))  		return;  	/* @@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,  		sg_cpu->util = prev_util;  	cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), -				   map_util_perf(sg_cpu->util), sg_cpu->max); +				   map_util_perf(sg_cpu->util), max_cap);  	sg_cpu->sg_policy->last_freq_update_time = time;  } @@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)  {  	struct sugov_policy *sg_policy = sg_cpu->sg_policy;  	struct cpufreq_policy *policy = sg_policy->policy; -	unsigned long util = 0, max = 1; +	unsigned long util = 0, max_cap;  	unsigned int j; +	max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); +  	for_each_cpu(j, policy->cpus) {  		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); -		unsigned long j_util, j_max;  		sugov_get_util(j_sg_cpu); -		sugov_iowait_apply(j_sg_cpu, time); -		j_util = j_sg_cpu->util; -		j_max = j_sg_cpu->max; +		sugov_iowait_apply(j_sg_cpu, time, max_cap); -		if (j_util * max > j_max * util) { -			util = j_util; -			max = j_max; -		} +		util = max(j_sg_cpu->util, util);  	} -	return get_next_freq(sg_policy, util, max); +	return get_next_freq(sg_policy, util, max_cap);  }  static void @@ -543,7 +546,7 @@ static void sugov_tunables_free(struct kobject *kobj)  	kfree(to_sugov_tunables(attr_set));  } -static struct kobj_type sugov_tunables_ktype = { +static const struct kobj_type sugov_tunables_ktype = {  	.default_groups = sugov_groups,  	.sysfs_ops = &governor_sysfs_ops,  	.release = &sugov_tunables_free, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 95fc77853743..af7952f12e6c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,10 @@   * Simple CPU accounting cgroup controller   */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + #include <asm/cputime.h> +#endif +  #ifdef CONFIG_IRQ_TIME_ACCOUNTING  /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0d97d54276cc..71b24371a6f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)  static void prio_changed_dl(struct rq *rq, struct task_struct *p,  			    int oldprio)  { -	if (task_on_rq_queued(p) || task_current(rq, p)) { +	if (!task_on_rq_queued(p)) +		return; +  #ifdef CONFIG_SMP -		/* -		 * This might be too much, but unfortunately -		 * we don't have the old deadline value, and -		 * we can't argue if the task is increasing -		 * or lowering its prio, so... -		 */ -		if (!rq->dl.overloaded) -			deadline_queue_pull_task(rq); +	/* +	 * This might be too much, but unfortunately +	 * we don't have the old deadline value, and +	 * we can't argue if the task is increasing +	 * or lowering its prio, so... 
+	 */ +	if (!rq->dl.overloaded) +		deadline_queue_pull_task(rq); +	if (task_current(rq, p)) {  		/*  		 * If we now have a earlier deadline task than p,  		 * then reschedule, provided p is still on this @@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,  		 */  		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))  			resched_curr(rq); -#else +	} else {  		/* -		 * Again, we don't know if p has a earlier -		 * or later deadline, so let's blindly set a -		 * (maybe not needed) rescheduling point. +		 * Current may not be deadline in case p was throttled but we +		 * have just replenished it (e.g. rt_mutex_setprio()). +		 * +		 * Otherwise, if p was given an earlier deadline, reschedule.  		 */ -		resched_curr(rq); -#endif /* CONFIG_SMP */ +		if (!dl_task(rq->curr) || +		    dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) +			resched_curr(rq);  	} +#else +	/* +	 * We don't know if p has a earlier or later deadline, so let's blindly +	 * set a (maybe not needed) rescheduling point. +	 */ +	resched_curr(rq); +#endif  }  DEFINE_SCHED_CLASS(dl) = { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c36aa54ae071..7a1b1f855b96 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)  	return NULL;  } -static inline struct sched_entity *parent_entity(struct sched_entity *se) +static inline struct sched_entity *parent_entity(const struct sched_entity *se)  {  	return se->parent;  } @@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)  	return min_vruntime;  } -static inline bool entity_before(struct sched_entity *a, -				struct sched_entity *b) +static inline bool entity_before(const struct sched_entity *a, +				 const struct sched_entity *b)  {  	return (s64)(a->vruntime - b->vruntime) < 0;  } @@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env,  		ns->nr_running += rq->cfs.h_nr_running;  		ns->compute_capacity += capacity_of(cpu); -		if (find_idle && !rq->nr_running && idle_cpu(cpu)) { +		if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {  			if (READ_ONCE(rq->numa_migrate_on) ||  			    !cpumask_test_cpu(cpu, env->p->cpus_ptr))  				continue; @@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env,  		int start = env->dst_cpu;  		/* Find alternative idle CPU. 
*/ -		for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { +		for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {  			if (cpu == env->best_cpu || !idle_cpu(cpu) ||  			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {  				continue; @@ -2938,11 +2938,11 @@ static void task_numa_work(struct callback_head *work)  	struct task_struct *p = current;  	struct mm_struct *mm = p->mm;  	u64 runtime = p->se.sum_exec_runtime; -	MA_STATE(mas, &mm->mm_mt, 0, 0);  	struct vm_area_struct *vma;  	unsigned long start, end;  	unsigned long nr_pte_updates = 0;  	long pages, virtpages; +	struct vma_iterator vmi;  	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -2995,16 +2995,16 @@ static void task_numa_work(struct callback_head *work)  	if (!mmap_read_trylock(mm))  		return; -	mas_set(&mas, start); -	vma = mas_find(&mas, ULONG_MAX); +	vma_iter_init(&vmi, mm, start); +	vma = vma_next(&vmi);  	if (!vma) {  		reset_ptenuma_scan(p);  		start = 0; -		mas_set(&mas, start); -		vma = mas_find(&mas, ULONG_MAX); +		vma_iter_set(&vmi, start); +		vma = vma_next(&vmi);  	} -	for (; vma; vma = mas_find(&mas, ULONG_MAX)) { +	do {  		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||  			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {  			continue; @@ -3051,7 +3051,7 @@ static void task_numa_work(struct callback_head *work)  			cond_resched();  		} while (end != vma->vm_end); -	} +	} for_each_vma(vmi, vma);  out:  	/* @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,  	 *  	 * For uclamp_max, we can tolerate a drop in performance level as the  	 * goal is to cap the task. So it's okay if it's getting less. -	 * -	 * In case of capacity inversion we should honour the inverted capacity -	 * for both uclamp_min and uclamp_max all the time.  	 */ -	capacity_orig = cpu_in_capacity_inversion(cpu); -	if (capacity_orig) { -		capacity_orig_thermal = capacity_orig; -	} else { -		capacity_orig = capacity_orig_of(cpu); -		capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); -	} +	capacity_orig = capacity_orig_of(cpu); +	capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);  	/*  	 * We want to force a task to fit a cpu as implied by uclamp_max. @@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util,  	 * handle the case uclamp_min > uclamp_max.  	 */  	uclamp_min = min(uclamp_min, uclamp_max); -	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) -		fits = fits && (uclamp_min <= capacity_orig_thermal); +	if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) +		return -1;  	return fits;  } @@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)  	unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);  	unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);  	unsigned long util = task_util_est(p); -	return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); +	/* +	 * Return true only if the cpu fully fits the task requirements, which +	 * include the utilization but also the performance hints. 
+	 */ +	return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);  }  static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4656,6 +4652,7 @@ static void  place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)  {  	u64 vruntime = cfs_rq->min_vruntime; +	u64 sleep_time;  	/*  	 * The 'current' period is already promised to the current tasks, @@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)  		vruntime -= thresh;  	} -	/* ensure we never gain time by being placed backwards. */ -	se->vruntime = max_vruntime(se->vruntime, vruntime); +	/* +	 * Pull vruntime of the entity being placed to the base level of +	 * cfs_rq, to prevent boosting it if placed backwards.  If the entity +	 * slept for a long time, don't even try to compare its vruntime with +	 * the base as it may be too far off and the comparison may get +	 * inversed due to s64 overflow. +	 */ +	sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; +	if ((s64)sleep_time > 60LL * NSEC_PER_SEC) +		se->vruntime = vruntime; +	else +		se->vruntime = max_vruntime(se->vruntime, vruntime);  }  static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)  	struct sched_entity *se;  	s64 delta; -	ideal_runtime = sched_slice(cfs_rq, curr); +	/* +	 * When many tasks blow up the sched_period; it is possible that +	 * sched_slice() reports unusually large results (when many tasks are +	 * very light for example). Therefore impose a maximum. +	 */ +	ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); +  	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;  	if (delta_exec > ideal_runtime) {  		resched_curr(rq_of(cfs_rq)); @@ -5461,22 +5474,105 @@ unthrottle_throttle:  		resched_curr(rq);  } -static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +#ifdef CONFIG_SMP +static void __cfsb_csd_unthrottle(void *arg)  { -	struct cfs_rq *cfs_rq; +	struct cfs_rq *cursor, *tmp; +	struct rq *rq = arg; +	struct rq_flags rf; + +	rq_lock(rq, &rf); + +	/* +	 * Since we hold rq lock we're safe from concurrent manipulation of +	 * the CSD list. However, this RCU critical section annotates the +	 * fact that we pair with sched_free_group_rcu(), so that we cannot +	 * race with group being freed in the window between removing it +	 * from the list and advancing to the next entry in the list. 
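util_fits_cpu() now effectively returns a tri-state: 1 when both the utilization and the uclamp hints fit, -1 when only the minimum-performance hint is unmet, and 0 when the utilization itself does not fit; select_idle_capacity() prefers -1 over 0 and then the largest capacity at that level. The sketch below models that preference with a deliberately simplified fit check (no uclamp_max or thermal pressure) and made-up capacities.

#include <stdio.h>

static int util_fits(unsigned long util, unsigned long uclamp_min,
		     unsigned long capacity)
{
	if (util > capacity)
		return 0;                /* does not fit at all      */
	if (uclamp_min > capacity)
		return -1;               /* fits, but hint is unmet  */
	return 1;                        /* fits with the hint       */
}

int main(void)
{
	unsigned long caps[] = { 256, 512, 1024 };
	unsigned long util = 300, uclamp_min = 700;
	int best_cpu = -1, best_fits = 0;
	unsigned long best_cap = 0;

	for (int cpu = 0; cpu < 3; cpu++) {
		int fits = util_fits(util, uclamp_min, caps[cpu]);

		if (fits > 0) {          /* fully fits: take it immediately */
			best_cpu = cpu;
			break;
		}
		/* Prefer -1 over 0, then the largest capacity at that level. */
		if (fits < best_fits ||
		    (fits == best_fits && caps[cpu] > best_cap)) {
			best_cpu = cpu;
			best_fits = fits;
			best_cap = caps[cpu];
		}
	}
	printf("chosen cpu: %d\n", best_cpu);   /* 2: capacity 1024 fits fully */
	return 0;
}

In the real code the -1 case additionally switches the capacity metric to the thermally adjusted one, as the select_idle_capacity() hunk above shows.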
+	 */ +	rcu_read_lock(); + +	list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, +				 throttled_csd_list) { +		list_del_init(&cursor->throttled_csd_list); + +		if (cfs_rq_throttled(cursor)) +			unthrottle_cfs_rq(cursor); +	} + +	rcu_read_unlock(); + +	rq_unlock(rq, &rf); +} + +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ +	struct rq *rq = rq_of(cfs_rq); +	bool first; + +	if (rq == this_rq()) { +		unthrottle_cfs_rq(cfs_rq); +		return; +	} + +	/* Already enqueued */ +	if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) +		return; + +	first = list_empty(&rq->cfsb_csd_list); +	list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); +	if (first) +		smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); +} +#else +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ +	unthrottle_cfs_rq(cfs_rq); +} +#endif + +static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ +	lockdep_assert_rq_held(rq_of(cfs_rq)); + +	if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || +	    cfs_rq->runtime_remaining <= 0)) +		return; + +	__unthrottle_cfs_rq_async(cfs_rq); +} + +static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +{ +	struct cfs_rq *local_unthrottle = NULL; +	int this_cpu = smp_processor_id();  	u64 runtime, remaining = 1; +	bool throttled = false; +	struct cfs_rq *cfs_rq; +	struct rq_flags rf; +	struct rq *rq;  	rcu_read_lock();  	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,  				throttled_list) { -		struct rq *rq = rq_of(cfs_rq); -		struct rq_flags rf; +		rq = rq_of(cfs_rq); + +		if (!remaining) { +			throttled = true; +			break; +		}  		rq_lock_irqsave(rq, &rf);  		if (!cfs_rq_throttled(cfs_rq))  			goto next; -		/* By the above check, this should never be true */ +#ifdef CONFIG_SMP +		/* Already queued for async unthrottle */ +		if (!list_empty(&cfs_rq->throttled_csd_list)) +			goto next; +#endif + +		/* By the above checks, this should never be true */  		SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);  		raw_spin_lock(&cfs_b->lock); @@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)  		cfs_rq->runtime_remaining += runtime;  		/* we check whether we're throttled above */ -		if (cfs_rq->runtime_remaining > 0) -			unthrottle_cfs_rq(cfs_rq); +		if (cfs_rq->runtime_remaining > 0) { +			if (cpu_of(rq) != this_cpu || +			    SCHED_WARN_ON(local_unthrottle)) +				unthrottle_cfs_rq_async(cfs_rq); +			else +				local_unthrottle = cfs_rq; +		} else { +			throttled = true; +		}  next:  		rq_unlock_irqrestore(rq, &rf); - -		if (!remaining) -			break;  	}  	rcu_read_unlock(); + +	if (local_unthrottle) { +		rq = cpu_rq(this_cpu); +		rq_lock_irqsave(rq, &rf); +		if (cfs_rq_throttled(local_unthrottle)) +			unthrottle_cfs_rq(local_unthrottle); +		rq_unlock_irqrestore(rq, &rf); +	} + +	return throttled;  }  /* @@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u  	while (throttled && cfs_b->runtime > 0) {  		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);  		/* we can't nest cfs_b->lock while distributing bandwidth */ -		distribute_cfs_runtime(cfs_b); +		throttled = distribute_cfs_runtime(cfs_b);  		raw_spin_lock_irqsave(&cfs_b->lock, flags); - -		throttled = !list_empty(&cfs_b->throttled_cfs_rq);  	}  	/* @@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)  {  	cfs_rq->runtime_enabled = 0;  	INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_SMP +	INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); +#endif  }  
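The async unthrottle machinery added above defers remote unthrottles: a throttled cfs_rq is appended to the target runqueue's cfsb_csd_list and the CSD/IPI is raised only when that list transitions from empty to non-empty (the `first = list_empty(...)` check in __unthrottle_cfs_rq_async()), while __cfsb_csd_unthrottle() later drains the whole list under the remote rq lock. The fragment below is a minimal userspace analogue of that "kick only on the empty-to-non-empty transition" pattern, sketched with pthreads rather than smp_call_function_single_async(); every name in it (struct worker, worker_queue(), ...) is illustrative and none of it is kernel API.

/*
 * Userspace sketch (not kernel code): producers append work to a
 * per-worker list and wake the worker only when the list was empty,
 * so a burst of items costs a single wakeup.  The worker drains the
 * whole list under its own lock, as __cfsb_csd_unthrottle() does.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct work_item {
	int id;
	struct work_item *next;
};

struct worker {
	pthread_mutex_t lock;
	pthread_cond_t kick;		/* stands in for the CSD/IPI */
	struct work_item *head;
	bool stop;
};

static void worker_queue(struct worker *w, struct work_item *item)
{
	bool first;

	pthread_mutex_lock(&w->lock);
	first = (w->head == NULL);	/* list was empty before this item? */
	item->next = w->head;
	w->head = item;
	pthread_mutex_unlock(&w->lock);

	if (first)			/* one notification per burst */
		pthread_cond_signal(&w->kick);
}

static void *worker_run(void *arg)
{
	struct worker *w = arg;

	pthread_mutex_lock(&w->lock);
	while (!w->stop) {
		while (w->head) {	/* drain everything that queued up */
			struct work_item *item = w->head;

			w->head = item->next;
			printf("processed item %d\n", item->id);
			free(item);
		}
		pthread_cond_wait(&w->kick, &w->lock);
	}
	pthread_mutex_unlock(&w->lock);
	return NULL;
}

int main(void)
{
	struct worker w = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.kick = PTHREAD_COND_INITIALIZER,
	};
	pthread_t tid;

	pthread_create(&tid, NULL, worker_run, &w);
	for (int i = 0; i < 8; i++) {
		struct work_item *item = malloc(sizeof(*item));

		item->id = i;
		worker_queue(&w, item);
	}
	sleep(1);			/* let the worker drain */
	pthread_mutex_lock(&w.lock);
	w.stop = true;
	pthread_mutex_unlock(&w.lock);
	pthread_cond_signal(&w.kick);
	pthread_join(tid, NULL);
	return 0;
}

A burst of queued entries therefore costs a single notification, which mirrors how one period-timer expiry can hand a whole batch of throttled cfs_rq's to a remote CPU with a single IPI.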
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  { +	int __maybe_unused i; +  	/* init_cfs_bandwidth() was not called */  	if (!cfs_b->throttled_cfs_rq.next)  		return;  	hrtimer_cancel(&cfs_b->period_timer);  	hrtimer_cancel(&cfs_b->slack_timer); + +	/* +	 * It is possible that we still have some cfs_rq's pending on a CSD +	 * list, though this race is very rare. In order for this to occur, we +	 * must have raced with the last task leaving the group while there +	 * exist throttled cfs_rq(s), and the period_timer must have queued the +	 * CSD item but the remote cpu has not yet processed it. To handle this, +	 * we can simply flush all pending CSD work inline here. We're +	 * guaranteed at this point that no additional cfs_rq of this group can +	 * join a CSD list. +	 */ +#ifdef CONFIG_SMP +	for_each_possible_cpu(i) { +		struct rq *rq = cpu_rq(i); +		unsigned long flags; + +		if (list_empty(&rq->cfsb_csd_list)) +			continue; + +		local_irq_save(flags); +		__cfsb_csd_unthrottle(rq); +		local_irq_restore(flags); +	} +#endif  }  /* @@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu)  	unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);  	unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); +	/* Return true only if the utilization doesn't fit CPU's capacity */  	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);  } @@ -6801,6 +6939,7 @@ static int  select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)  {  	unsigned long task_util, util_min, util_max, best_cap = 0; +	int fits, best_fits = 0;  	int cpu, best_cpu = -1;  	struct cpumask *cpus; @@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)  	util_min = uclamp_eff_value(p, UCLAMP_MIN);  	util_max = uclamp_eff_value(p, UCLAMP_MAX); -	for_each_cpu_wrap(cpu, cpus, target) { +	for_each_cpu_wrap(cpu, cpus, target + 1) {  		unsigned long cpu_cap = capacity_of(cpu);  		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))  			continue; -		if (util_fits_cpu(task_util, util_min, util_max, cpu)) + +		fits = util_fits_cpu(task_util, util_min, util_max, cpu); + +		/* This CPU fits with all requirements */ +		if (fits > 0)  			return cpu; +		/* +		 * Only the min performance hint (i.e. uclamp_min) doesn't fit. +		 * Look for the CPU with best capacity. +		 */ +		else if (fits < 0) +			cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); -		if (cpu_cap > best_cap) { +		/* +		 * First, select CPU which fits better (-1 being better than 0). +		 * Then, select the one with best capacity at same level. +		 */ +		if ((fits < best_fits) || +		    ((fits == best_fits) && (cpu_cap > best_cap))) {  			best_cap = cpu_cap;  			best_cpu = cpu; +			best_fits = fits;  		}  	} @@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util,  				 int cpu)  {  	if (sched_asym_cpucap_active()) -		return util_fits_cpu(util, util_min, util_max, cpu); +		/* +		 * Return true only if the cpu fully fits the task requirements +		 * which include the utilization and the performance hints. +		 */ +		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);  	return true;  } @@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  	unsigned long p_util_max = uclamp_is_used() ? 
uclamp_eff_value(p, UCLAMP_MAX) : 1024;  	struct root_domain *rd = this_rq()->rd;  	int cpu, best_energy_cpu, target = -1; +	int prev_fits = -1, best_fits = -1; +	unsigned long best_thermal_cap = 0; +	unsigned long prev_thermal_cap = 0;  	struct sched_domain *sd;  	struct perf_domain *pd;  	struct energy_env eenv; @@ -7229,13 +7391,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  	eenv_task_busy_time(&eenv, p, prev_cpu);  	for (; pd; pd = pd->next) { +		unsigned long util_min = p_util_min, util_max = p_util_max;  		unsigned long cpu_cap, cpu_thermal_cap, util;  		unsigned long cur_delta, max_spare_cap = 0;  		unsigned long rq_util_min, rq_util_max; -		unsigned long util_min, util_max;  		unsigned long prev_spare_cap = 0;  		int max_spare_cap_cpu = -1;  		unsigned long base_energy; +		int fits, max_fits = -1;  		cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7251,6 +7414,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  		eenv.pd_cap = 0;  		for_each_cpu(cpu, cpus) { +			struct rq *rq = cpu_rq(cpu); +  			eenv.pd_cap += cpu_thermal_cap;  			if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) @@ -7269,26 +7434,23 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  			 * much capacity we can get out of the CPU; this is  			 * aligned with sched_cpu_util().  			 */ -			if (uclamp_is_used()) { -				if (uclamp_rq_is_idle(cpu_rq(cpu))) { -					util_min = p_util_min; -					util_max = p_util_max; -				} else { -					/* -					 * Open code uclamp_rq_util_with() except for -					 * the clamp() part. Ie: apply max aggregation -					 * only. util_fits_cpu() logic requires to -					 * operate on non clamped util but must use the -					 * max-aggregated uclamp_{min, max}. -					 */ -					rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); -					rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); - -					util_min = max(rq_util_min, p_util_min); -					util_max = max(rq_util_max, p_util_max); -				} +			if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) { +				/* +				 * Open code uclamp_rq_util_with() except for +				 * the clamp() part. Ie: apply max aggregation +				 * only. util_fits_cpu() logic requires to +				 * operate on non clamped util but must use the +				 * max-aggregated uclamp_{min, max}. +				 */ +				rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN); +				rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX); + +				util_min = max(rq_util_min, p_util_min); +				util_max = max(rq_util_max, p_util_max);  			} -			if (!util_fits_cpu(util, util_min, util_max, cpu)) + +			fits = util_fits_cpu(util, util_min, util_max, cpu); +			if (!fits)  				continue;  			lsub_positive(&cpu_cap, util); @@ -7296,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  			if (cpu == prev_cpu) {  				/* Always use prev_cpu as a candidate. 
*/  				prev_spare_cap = cpu_cap; -			} else if (cpu_cap > max_spare_cap) { +				prev_fits = fits; +			} else if ((fits > max_fits) || +				   ((fits == max_fits) && (cpu_cap > max_spare_cap))) {  				/*  				 * Find the CPU with the maximum spare capacity  				 * among the remaining CPUs in the performance @@ -7304,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  				 */  				max_spare_cap = cpu_cap;  				max_spare_cap_cpu = cpu; +				max_fits = fits;  			}  		} @@ -7322,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)  			if (prev_delta < base_energy)  				goto unlock;  			prev_delta -= base_energy; +			prev_thermal_cap = cpu_thermal_cap;  			best_delta = min(best_delta, prev_delta);  		}  		/* Evaluate the energy impact of using max_spare_cap_cpu. */  		if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { +			/* Current best energy cpu fits better */ +			if (max_fits < best_fits) +				continue; + +			/* +			 * Both don't fit performance hint (i.e. uclamp_min) +			 * but best energy cpu has better capacity. +			 */ +			if ((max_fits < 0) && +			    (cpu_thermal_cap <= best_thermal_cap)) +				continue; +  			cur_delta = compute_energy(&eenv, pd, cpus, p,  						   max_spare_cap_cpu);  			/* CPU utilization has changed */  			if (cur_delta < base_energy)  				goto unlock;  			cur_delta -= base_energy; -			if (cur_delta < best_delta) { -				best_delta = cur_delta; -				best_energy_cpu = max_spare_cap_cpu; -			} + +			/* +			 * Both fit for the task but best energy cpu has lower +			 * energy impact. +			 */ +			if ((max_fits > 0) && (best_fits > 0) && +			    (cur_delta >= best_delta)) +				continue; + +			best_delta = cur_delta; +			best_energy_cpu = max_spare_cap_cpu; +			best_fits = max_fits; +			best_thermal_cap = cpu_thermal_cap;  		}  	}  	rcu_read_unlock(); -	if (best_delta < prev_delta) +	if ((best_fits > prev_fits) || +	    ((best_fits > 0) && (best_delta < prev_delta)) || +	    ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))  		target = best_energy_cpu;  	return target; @@ -8841,73 +9030,16 @@ static unsigned long scale_rt_capacity(int cpu)  static void update_cpu_capacity(struct sched_domain *sd, int cpu)  { -	unsigned long capacity_orig = arch_scale_cpu_capacity(cpu);  	unsigned long capacity = scale_rt_capacity(cpu);  	struct sched_group *sdg = sd->groups; -	struct rq *rq = cpu_rq(cpu); -	rq->cpu_capacity_orig = capacity_orig; +	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);  	if (!capacity)  		capacity = 1; -	rq->cpu_capacity = capacity; - -	/* -	 * Detect if the performance domain is in capacity inversion state. -	 * -	 * Capacity inversion happens when another perf domain with equal or -	 * lower capacity_orig_of() ends up having higher capacity than this -	 * domain after subtracting thermal pressure. -	 * -	 * We only take into account thermal pressure in this detection as it's -	 * the only metric that actually results in *real* reduction of -	 * capacity due to performance points (OPPs) being dropped/become -	 * unreachable due to thermal throttling. -	 * -	 * We assume: -	 *   * That all cpus in a perf domain have the same capacity_orig -	 *     (same uArch). -	 *   * Thermal pressure will impact all cpus in this perf domain -	 *     equally. 
-	 */ -	if (static_branch_unlikely(&sched_asym_cpucapacity)) { -		unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); -		struct perf_domain *pd = rcu_dereference(rq->rd->pd); - -		rq->cpu_capacity_inverted = 0; - -		for (; pd; pd = pd->next) { -			struct cpumask *pd_span = perf_domain_span(pd); -			unsigned long pd_cap_orig, pd_cap; - -			cpu = cpumask_any(pd_span); -			pd_cap_orig = arch_scale_cpu_capacity(cpu); - -			if (capacity_orig < pd_cap_orig) -				continue; - -			/* -			 * handle the case of multiple perf domains have the -			 * same capacity_orig but one of them is under higher -			 * thermal pressure. We record it as capacity -			 * inversion. -			 */ -			if (capacity_orig == pd_cap_orig) { -				pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); - -				if (pd_cap > inv_cap) { -					rq->cpu_capacity_inverted = inv_cap; -					break; -				} -			} else if (pd_cap_orig > inv_cap) { -				rq->cpu_capacity_inverted = inv_cap; -				break; -			} -		} -	} - -	trace_sched_cpu_capacity_tp(rq); +	cpu_rq(cpu)->cpu_capacity = capacity; +	trace_sched_cpu_capacity_tp(cpu_rq(cpu));  	sdg->sgc->capacity = capacity;  	sdg->sgc->min_capacity = capacity; @@ -10135,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  	 */  	update_sd_lb_stats(env, &sds); -	if (sched_energy_enabled()) { -		struct root_domain *rd = env->dst_rq->rd; - -		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) -			goto out_balanced; -	} - -	local = &sds.local_stat; -	busiest = &sds.busiest_stat; -  	/* There is no busy sibling group to pull tasks from */  	if (!sds.busiest)  		goto out_balanced; +	busiest = &sds.busiest_stat; +  	/* Misfit tasks should be dealt with regardless of the avg load */  	if (busiest->group_type == group_misfit_task)  		goto force_balance; +	if (sched_energy_enabled()) { +		struct root_domain *rd = env->dst_rq->rd; + +		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) +			goto out_balanced; +	} +  	/* ASYM feature bypasses nice load balance check */  	if (busiest->group_type == group_asym_packing)  		goto force_balance; @@ -10165,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)  	if (busiest->group_type == group_imbalanced)  		goto force_balance; +	local = &sds.local_stat;  	/*  	 * If the local group is busier than the selected busiest group  	 * don't try and pull any tasks. @@ -11728,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)  /*   * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.   
*/ -static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, +			 bool forceidle)  {  	for_each_sched_entity(se) {  		struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -11753,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)  	se_fi_update(se, rq->core->core_forceidle_seq, in_fi);  } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, +			bool in_fi)  {  	struct rq *rq = task_rq(a); -	struct sched_entity *sea = &a->se; -	struct sched_entity *seb = &b->se; +	const struct sched_entity *sea = &a->se; +	const struct sched_entity *seb = &b->se;  	struct cfs_rq *cfs_rqa;  	struct cfs_rq *cfs_rqb;  	s64 delta; @@ -12474,6 +12608,11 @@ __init void init_sched_fair_class(void)  	for_each_possible_cpu(i) {  		zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));  		zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i)); + +#ifdef CONFIG_CFS_BANDWIDTH +		INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); +		INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); +#endif  	}  	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f26ab2675f7d..e9ef66be2870 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup);  static noinline int __cpuidle cpu_idle_poll(void)  { +	instrumentation_begin();  	trace_cpu_idle(0, smp_processor_id());  	stop_critical_timings(); -	ct_idle_enter(); -	local_irq_enable(); +	ct_cpuidle_enter(); +	raw_local_irq_enable();  	while (!tif_need_resched() &&  	       (cpu_idle_force_poll || tick_check_broadcast_expired()))  		cpu_relax(); +	raw_local_irq_disable(); -	ct_idle_exit(); +	ct_cpuidle_exit();  	start_critical_timings();  	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +	local_irq_enable(); +	instrumentation_end();  	return 1;  } @@ -75,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { }  void __weak arch_cpu_idle(void)  {  	cpu_idle_force_poll = 1; -	raw_local_irq_enable();  }  /** @@ -85,44 +88,20 @@ void __weak arch_cpu_idle(void)   */  void __cpuidle default_idle_call(void)  { -	if (current_clr_polling_and_test()) { -		local_irq_enable(); -	} else { - +	instrumentation_begin(); +	if (!current_clr_polling_and_test()) {  		trace_cpu_idle(1, smp_processor_id());  		stop_critical_timings(); -		/* -		 * arch_cpu_idle() is supposed to enable IRQs, however -		 * we can't do that because of RCU and tracing. -		 * -		 * Trace IRQs enable here, then switch off RCU, and have -		 * arch_cpu_idle() use raw_local_irq_enable(). Note that -		 * ct_idle_enter() relies on lockdep IRQ state, so switch that -		 * last -- this is very similar to the entry code. -		 */ -		trace_hardirqs_on_prepare(); -		lockdep_hardirqs_on_prepare(); -		ct_idle_enter(); -		lockdep_hardirqs_on(_THIS_IP_); - +		ct_cpuidle_enter();  		arch_cpu_idle(); - -		/* -		 * OK, so IRQs are enabled here, but RCU needs them disabled to -		 * turn itself back on.. funny thing is that disabling IRQs -		 * will cause tracing, which needs RCU. Jump through hoops to -		 * make it 'work'. 
-		 */ -		raw_local_irq_disable(); -		lockdep_hardirqs_off(_THIS_IP_); -		ct_idle_exit(); -		lockdep_hardirqs_on(_THIS_IP_); -		raw_local_irq_enable(); +		ct_cpuidle_exit();  		start_critical_timings();  		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());  	} +	local_irq_enable(); +	instrumentation_end();  }  static int call_cpuidle_s2idle(struct cpuidle_driver *drv, diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 0c5be7ebb1dc..2ad881d07752 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -159,7 +159,8 @@  	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\  	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\  	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK		\ -	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) +	| MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK			\ +	| MEMBARRIER_CMD_GET_REGISTRATIONS)  static void ipi_mb(void *info)  { @@ -540,6 +541,40 @@ static int membarrier_register_private_expedited(int flags)  	return 0;  } +static int membarrier_get_registrations(void) +{ +	struct task_struct *p = current; +	struct mm_struct *mm = p->mm; +	int registrations_mask = 0, membarrier_state, i; +	static const int states[] = { +		MEMBARRIER_STATE_GLOBAL_EXPEDITED | +			MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, +		MEMBARRIER_STATE_PRIVATE_EXPEDITED | +			MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, +		MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE | +			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY, +		MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ | +			MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY +	}; +	static const int registration_cmds[] = { +		MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, +		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, +		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, +		MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ +	}; +	BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds)); + +	membarrier_state = atomic_read(&mm->membarrier_state); +	for (i = 0; i < ARRAY_SIZE(states); ++i) { +		if (membarrier_state & states[i]) { +			registrations_mask |= registration_cmds[i]; +			membarrier_state &= ~states[i]; +		} +	} +	WARN_ON_ONCE(membarrier_state != 0); +	return registrations_mask; +} +  /**   * sys_membarrier - issue memory barriers on a set of threads   * @cmd:    Takes command values defined in enum membarrier_cmd. @@ -623,6 +658,8 @@ SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)  		return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);  	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:  		return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); +	case MEMBARRIER_CMD_GET_REGISTRATIONS: +		return membarrier_get_registrations();  	default:  		return -EINVAL;  	} diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8ac8b81bfee6..02e011cabe91 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1343,10 +1343,11 @@ void psi_trigger_destroy(struct psi_trigger *t)  	group = t->group;  	/* -	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted -	 * from under a polling process. +	 * Wakeup waiters to stop polling and clear the queue to prevent it from +	 * being accessed later. Can happen if cgroup is deleted from under a +	 * polling process.  	 
*/ -	wake_up_interruptible(&t->event_wait); +	wake_up_pollfree(&t->event_wait);  	mutex_lock(&group->trigger_lock); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ed2a47e4ddae..0a11f44adee5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1777,6 +1777,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)  	BUG_ON(idx >= MAX_RT_PRIO);  	queue = array->queue + idx; +	if (SCHED_WARN_ON(list_empty(queue))) +		return NULL;  	next = list_entry(queue->next, struct sched_rt_entity, run_list);  	return next; @@ -1789,7 +1791,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)  	do {  		rt_se = pick_next_rt_entity(rt_rq); -		BUG_ON(!rt_se); +		if (unlikely(!rt_se)) +			return NULL;  		rt_rq = group_rt_rq(rt_se);  	} while (rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 771f8ddb7053..3e8df6d31c1e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -248,7 +248,7 @@ static inline void update_avg(u64 *avg, u64 sample)  #define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) -static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)  {  #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL  	return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); @@ -260,8 +260,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)  /*   * Tells if entity @a should preempt entity @b.   */ -static inline bool -dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +static inline bool dl_entity_preempt(const struct sched_dl_entity *a, +				     const struct sched_dl_entity *b)  {  	return dl_entity_is_special(a) ||  	       dl_time_before(a->deadline, b->deadline); @@ -645,6 +645,9 @@ struct cfs_rq {  	int			throttled;  	int			throttle_count;  	struct list_head	throttled_list; +#ifdef CONFIG_SMP +	struct list_head	throttled_csd_list; +#endif  #endif /* CONFIG_CFS_BANDWIDTH */  #endif /* CONFIG_FAIR_GROUP_SCHED */  }; @@ -1041,7 +1044,6 @@ struct rq {  	unsigned long		cpu_capacity;  	unsigned long		cpu_capacity_orig; -	unsigned long		cpu_capacity_inverted;  	struct balance_callback *balance_callback; @@ -1154,6 +1156,11 @@ struct rq {  	/* Scratch cpumask to be temporarily used under rq_lock */  	cpumask_var_t		scratch_mask; + +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) +	call_single_data_t	cfsb_csd; +	struct list_head	cfsb_csd_list; +#endif  };  #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1236,7 +1243,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)  	return &rq->__lock;  } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, +			bool fi);  /*   * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1415,7 +1423,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)  }  /* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)  {  	return se->cfs_rq;  } @@ -1428,19 +1436,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)  #else -static inline struct task_struct *task_of(struct sched_entity *se) -{ -	return container_of(se, struct task_struct, se); -} +#define task_of(_se)	container_of(_se, struct task_struct, se) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +static inline 
struct cfs_rq *task_cfs_rq(const struct task_struct *p)  {  	return &task_rq(p)->cfs;  } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se)  { -	struct task_struct *p = task_of(se); +	const struct task_struct *p = task_of(se);  	struct rq *rq = task_rq(p);  	return &rq->cfs; @@ -2893,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu)  	return cpu_rq(cpu)->cpu_capacity_orig;  } -/* - * Returns inverted capacity if the CPU is in capacity inversion state. - * 0 otherwise. - * - * Capacity inversion detection only considers thermal impact where actual - * performance points (OPPs) gets dropped. - * - * Capacity inversion state happens when another performance domain that has - * equal or lower capacity_orig_of() becomes effectively larger than the perf - * domain this CPU belongs to due to thermal pressure throttling it hard. - * - * See comment in update_cpu_capacity(). - */ -static inline unsigned long cpu_in_capacity_inversion(int cpu) -{ -	return cpu_rq(cpu)->cpu_capacity_inverted; -} -  /**   * enum cpu_util_type - CPU utilization type   * @FREQUENCY_UTIL:	Utilization used to select frequency @@ -3261,4 +3248,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr,  	cgroup_account_cputime(curr, delta_exec);  } +#ifdef CONFIG_SCHED_MM_CID +static inline int __mm_cid_get(struct mm_struct *mm) +{ +	struct cpumask *cpumask; +	int cid; + +	cpumask = mm_cidmask(mm); +	cid = cpumask_first_zero(cpumask); +	if (cid >= nr_cpu_ids) +		return -1; +	__cpumask_set_cpu(cid, cpumask); +	return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm, int cid) +{ +	lockdep_assert_irqs_disabled(); +	if (cid < 0) +		return; +	raw_spin_lock(&mm->cid_lock); +	__cpumask_clear_cpu(cid, mm_cidmask(mm)); +	raw_spin_unlock(&mm->cid_lock); +} + +static inline int mm_cid_get(struct mm_struct *mm) +{ +	int ret; + +	lockdep_assert_irqs_disabled(); +	raw_spin_lock(&mm->cid_lock); +	ret = __mm_cid_get(mm); +	raw_spin_unlock(&mm->cid_lock); +	return ret; +} + +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) +{ +	if (prev->mm_cid_active) { +		if (next->mm_cid_active && next->mm == prev->mm) { +			/* +			 * Context switch between threads in same mm, hand over +			 * the mm_cid from prev to next. 
+			 */ +			next->mm_cid = prev->mm_cid; +			prev->mm_cid = -1; +			return; +		} +		mm_cid_put(prev->mm, prev->mm_cid); +		prev->mm_cid = -1; +	} +	if (next->mm_cid_active) +		next->mm_cid = mm_cid_get(next->mm); +} + +#else +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } +#endif +  #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8739c2a5a54e..051aaf65c749 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,6 +3,8 @@   * Scheduler topology setup/handling methods   */ +#include <linux/bsearch.h> +  DEFINE_MUTEX(sched_domains_mutex);  /* Protected by sched_domains_mutex: */ @@ -578,7 +580,7 @@ out:   */  struct root_domain def_root_domain; -void init_defrootdomain(void) +void __init init_defrootdomain(void)  {  	init_rootdomain(&def_root_domain); @@ -2067,6 +2069,99 @@ unlock:  	return found;  } +struct __cmp_key { +	const struct cpumask *cpus; +	struct cpumask ***masks; +	int node; +	int cpu; +	int w; +}; + +static int hop_cmp(const void *a, const void *b) +{ +	struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b; +	struct __cmp_key *k = (struct __cmp_key *)a; + +	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu) +		return 1; + +	if (b == k->masks) { +		k->w = 0; +		return 0; +	} + +	prev_hop = *((struct cpumask ***)b - 1); +	k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]); +	if (k->w <= k->cpu) +		return 0; + +	return -1; +} + +/* + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu + *                             closest to @cpu from @cpumask. + * cpumask: cpumask to find a cpu from + * cpu: Nth cpu to find + * + * returns: cpu, or nr_cpu_ids when nothing found. + */ +int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) +{ +	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu }; +	struct cpumask ***hop_masks; +	int hop, ret = nr_cpu_ids; + +	rcu_read_lock(); + +	k.masks = rcu_dereference(sched_domains_numa_masks); +	if (!k.masks) +		goto unlock; + +	hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); +	hop = hop_masks	- k.masks; + +	ret = hop ? +		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) : +		cpumask_nth_and(cpu, cpus, k.masks[0][node]); +unlock: +	rcu_read_unlock(); +	return ret; +} +EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu); + +/** + * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from + *                         @node + * @node: The node to count hops from. + * @hops: Include CPUs up to that many hops away. 0 means local node. + * + * Return: On success, a pointer to a cpumask of CPUs at most @hops away from + * @node, an error value otherwise. + * + * Requires rcu_lock to be held. Returned cpumask is only valid within that + * read-side section, copy it if required beyond that. + * + * Note that not all hops are equal in distance; see sched_init_numa() for how + * distances and masks are handled. + * Also note that this is a reflection of sched_domains_numa_masks, which may change + * during the lifetime of the system (offline nodes are taken out of the masks). 
+ */ +const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops) +{ +	struct cpumask ***masks; + +	if (node >= nr_node_ids || hops >= sched_domains_numa_levels) +		return ERR_PTR(-EINVAL); + +	masks = rcu_dereference(sched_domains_numa_masks); +	if (!masks) +		return ERR_PTR(-EBUSY); + +	return masks[hops][node]; +} +EXPORT_SYMBOL_GPL(sched_numa_hop_mask); +  #endif /* CONFIG_NUMA */  static int __sdt_alloc(const struct cpumask *cpu_map) @@ -2451,7 +2546,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)   * Set up scheduler domains and groups.  For now this just excludes isolated   * CPUs, but could be used to exclude other special cases in the future.   */ -int sched_init_domains(const struct cpumask *cpu_map) +int __init sched_init_domains(const struct cpumask *cpu_map)  {  	int err; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e9852d1b4a5e..cebf26445f9e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -388,6 +388,7 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte  }  #endif /* SECCOMP_ARCH_NATIVE */ +#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))  /**   * seccomp_run_filters - evaluates all seccomp filters against @sd   * @sd: optional seccomp data to be passed to filters @@ -397,7 +398,6 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte   *   * Returns valid seccomp BPF response codes.   */ -#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))  static u32 seccomp_run_filters(const struct seccomp_data *sd,  			       struct seccomp_filter **match)  { diff --git a/kernel/signal.c b/kernel/signal.c index ae26da61c4d9..8cb28f1df294 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2951,6 +2951,7 @@ void exit_signals(struct task_struct *tsk)  	cgroup_threadgroup_change_begin(tsk);  	if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { +		sched_mm_cid_exit_signals(tsk);  		tsk->flags |= PF_EXITING;  		cgroup_threadgroup_change_end(tsk);  		return; @@ -2961,6 +2962,7 @@ void exit_signals(struct task_struct *tsk)  	 * From now this task is not visible for group-wide signals,  	 * see wants_signal(), do_signal_stop().  	 */ +	sched_mm_cid_exit_signals(tsk);  	tsk->flags |= PF_EXITING;  	cgroup_threadgroup_change_end(tsk); diff --git a/kernel/sys.c b/kernel/sys.c index 5fd54bf0e886..495cd87d9bf4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1442,6 +1442,8 @@ static int do_prlimit(struct task_struct *tsk, unsigned int resource,  	if (resource >= RLIM_NLIMITS)  		return -EINVAL; +	resource = array_index_nospec(resource, RLIM_NLIMITS); +  	if (new_rlim) {  		if (new_rlim->rlim_cur > new_rlim->rlim_max)  			return -EINVAL; @@ -2348,6 +2350,33 @@ static int prctl_set_vma(unsigned long opt, unsigned long start,  }  #endif /* CONFIG_ANON_VMA_NAME */ +static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, +				 unsigned long arg4, unsigned long arg5) +{ +	if (arg3 || arg4 || arg5) +		return -EINVAL; + +	if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) +		return -EINVAL; + +	if (bits & PR_MDWE_REFUSE_EXEC_GAIN) +		set_bit(MMF_HAS_MDWE, ¤t->mm->flags); +	else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) +		return -EPERM; /* Cannot unset the flag */ + +	return 0; +} + +static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, +				 unsigned long arg4, unsigned long arg5) +{ +	if (arg2 || arg3 || arg4 || arg5) +		return -EINVAL; + +	return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? 
+		PR_MDWE_REFUSE_EXEC_GAIN : 0; +} +  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		unsigned long, arg4, unsigned long, arg5)  { @@ -2623,6 +2652,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		error = sched_core_share_pid(arg2, arg3, arg4, arg5);  		break;  #endif +	case PR_SET_MDWE: +		error = prctl_set_mdwe(arg2, arg3, arg4, arg5); +		break; +	case PR_GET_MDWE: +		error = prctl_get_mdwe(arg2, arg3, arg4, arg5); +		break;  	case PR_SET_VMA:  		error = prctl_set_vma(arg2, arg3, arg4, arg5);  		break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 137d4abe3eda..1c240d2c99bc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -425,21 +425,6 @@ static void proc_put_char(void **buf, size_t *size, char c)  	}  } -static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp, -				int *valp, -				int write, void *data) -{ -	if (write) { -		*(bool *)valp = *lvalp; -	} else { -		int val = *(bool *)valp; - -		*lvalp = (unsigned long)val; -		*negp = false; -	} -	return 0; -} -  static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,  				 int *valp,  				 int write, void *data) @@ -710,16 +695,36 @@ int do_proc_douintvec(struct ctl_table *table, int write,   * @lenp: the size of the user buffer   * @ppos: file position   * - * Reads/writes up to table->maxlen/sizeof(unsigned int) integer - * values from/to the user buffer, treated as an ASCII string. + * Reads/writes one integer value from/to the user buffer, + * treated as an ASCII string. + * + * table->data must point to a bool variable and table->maxlen must + * be sizeof(bool).   *   * Returns 0 on success.   */  int proc_dobool(struct ctl_table *table, int write, void *buffer,  		size_t *lenp, loff_t *ppos)  { -	return do_proc_dointvec(table, write, buffer, lenp, ppos, -				do_proc_dobool_conv, NULL); +	struct ctl_table tmp; +	bool *data = table->data; +	int res, val; + +	/* Do not support arrays yet. */ +	if (table->maxlen != sizeof(bool)) +		return -EINVAL; + +	tmp = *table; +	tmp.maxlen = sizeof(val); +	tmp.data = &val; + +	val = READ_ONCE(*data); +	res = proc_dointvec(&tmp, write, buffer, lenp, ppos); +	if (res) +		return res; +	if (write) +		WRITE_ONCE(*data, val); +	return 0;  }  /** diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a41753be1a2b..bae8f11070be 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US  	int "Clocksource watchdog maximum allowable skew (in μs)"  	depends on CLOCKSOURCE_WATCHDOG  	range 50 1000 -	default 100 +	default 125  	help  	  Specify the maximum amount of allowable watchdog skew in  	  microseconds before reporting the clocksource to be unstable. +	  The default is based on a half-second clocksource watchdog +	  interval and NTP's maximum frequency drift of 500 parts +	  per million.	If the clocksource is good enough for NTP, +	  it is good enough for the clocksource watchdog!  
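For the PR_SET_MDWE / PR_GET_MDWE hunk in kernel/sys.c above, the sketch below shows how a process might opt in to memory-deny-write-execute from userspace and read the registration back. It is a hedged illustration, not part of this series: the #ifndef fallback values (65, 66, 1UL << 0) are taken from the uapi <linux/prctl.h> additions that accompany this change and should be verified locally, and the mmap() probe only demonstrates the expected refusal of a writable and executable mapping once the flag is set.

/*
 * Userspace sketch for the PR_SET_MDWE/PR_GET_MDWE prctl added above.
 * The fallback constants mirror the uapi values introduced by this
 * series; prefer the definitions from <linux/prctl.h> when present.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_MDWE
#define PR_SET_MDWE			65
#define PR_MDWE_REFUSE_EXEC_GAIN	(1UL << 0)
#endif
#ifndef PR_GET_MDWE
#define PR_GET_MDWE			66
#endif

int main(void)
{
	/*
	 * Opt in: this process may no longer gain PROT_EXEC on writable
	 * memory.  arg3..arg5 must be zero, see prctl_set_mdwe().
	 */
	if (prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0))
		perror("PR_SET_MDWE");

	/* Read the registration back; returns the bitmask that was set. */
	printf("PR_GET_MDWE -> %d\n", prctl(PR_GET_MDWE, 0, 0, 0, 0));

	/* A writable+executable mapping is expected to be refused now. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		printf("W^X mapping refused as expected: %s\n",
		       strerror(errno));
	else
		printf("mapping unexpectedly succeeded (kernel without MDWE?)\n");

	return 0;
}

Note that prctl_set_mdwe() above rejects any attempt to clear the flag: after a successful registration, PR_SET_MDWE with bits == 0 returns -EPERM.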
endmenu  endif diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 5897828b9d7e..7e5dff602585 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -470,11 +470,35 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)  }  EXPORT_SYMBOL_GPL(alarm_forward); -u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throttle)  {  	struct alarm_base *base = &alarm_bases[alarm->type]; +	ktime_t now = base->get_ktime(); + +	if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && throttle) { +		/* +		 * Same issue as with posix_timer_fn(). Timers which are +		 * periodic but the signal is ignored can starve the system +		 * with a very small interval. The real fix which was +		 * promised in the context of posix_timer_fn() never +		 * materialized, but someone should really work on it. +		 * +		 * To prevent DOS fake @now to be 1 jiffie out which keeps +		 * the overrun accounting correct but creates an +		 * inconsistency vs. timer_gettime(2). +		 */ +		ktime_t kj = NSEC_PER_SEC / HZ; + +		if (interval < kj) +			now = ktime_add(now, kj); +	} + +	return alarm_forward(alarm, now, interval); +} -	return alarm_forward(alarm, base->get_ktime(), interval); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ +	return __alarm_forward_now(alarm, interval, false);  }  EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -551,9 +575,10 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,  	if (posix_timer_event(ptr, si_private) && ptr->it_interval) {  		/*  		 * Handle ignored signals and rearm the timer. This will go -		 * away once we handle ignored signals proper. +		 * away once we handle ignored signals proper. Ensure that +		 * small intervals cannot starve the system.  		 */ -		ptr->it_overrun += alarm_forward_now(alarm, ptr->it_interval); +		ptr->it_overrun += __alarm_forward_now(alarm, ptr->it_interval, true);  		++ptr->it_requeue_pending;  		ptr->it_active = 1;  		result = ALARMTIMER_RESTART; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 9cf32ccda715..91836b727cef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -96,6 +96,11 @@ static int finished_booting;  static u64 suspend_start;  /* + * Interval: 0.5sec. + */ +#define WATCHDOG_INTERVAL (HZ >> 1) + +/*   * Threshold: 0.0312s, when doubled: 0.0625s.   * Also a default for cs->uncertainty_margin when registering clocks.   */ @@ -106,11 +111,14 @@ static u64 suspend_start;   * clocksource surrounding a read of the clocksource being validated.   * This delay could be due to SMIs, NMIs, or to VCPU preemptions.  Used as   * a lower bound for cs->uncertainty_margin values when registering clocks. + * + * The default of 500 parts per million is based on NTP's limits. + * If a clocksource is good enough for NTP, it is good enough for us!   */  #ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US  #define MAX_SKEW_USEC	CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US  #else -#define MAX_SKEW_USEC	100 +#define MAX_SKEW_USEC	(125 * WATCHDOG_INTERVAL / HZ)  #endif  #define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) @@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)  static int clocksource_watchdog_kthread(void *data);  static void __clocksource_change_rating(struct clocksource *cs, int rating); -/* - * Interval: 0.5sec. 
- */ -#define WATCHDOG_INTERVAL (HZ >> 1) -  static void clocksource_watchdog_work(struct work_struct *work)  {  	/* @@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,  			goto skip_test;  	} -	pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", -		smp_processor_id(), watchdog->name, wd_delay, nretries); +	pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", +		smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);  	return WD_READ_UNSTABLE;  skip_test: @@ -384,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs)  }  EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +static inline void clocksource_reset_watchdog(void) +{ +	struct clocksource *cs; + +	list_for_each_entry(cs, &watchdog_list, wd_list) +		cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + +  static void clocksource_watchdog(struct timer_list *unused)  {  	u64 csnow, wdnow, cslast, wdlast, delta; @@ -391,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused)  	int64_t wd_nsec, cs_nsec;  	struct clocksource *cs;  	enum wd_read_status read_ret; +	unsigned long extra_wait = 0;  	u32 md;  	spin_lock(&watchdog_lock); @@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)  		read_ret = cs_watchdog_read(cs, &csnow, &wdnow); -		if (read_ret != WD_READ_SUCCESS) { -			if (read_ret == WD_READ_UNSTABLE) -				/* Clock readout unreliable, so give it up. */ -				__clocksource_unstable(cs); +		if (read_ret == WD_READ_UNSTABLE) { +			/* Clock readout unreliable, so give it up. */ +			__clocksource_unstable(cs);  			continue;  		} +		/* +		 * When WD_READ_SKIP is returned, it means the system is likely +		 * under very heavy load, where the latency of reading +		 * watchdog/clocksource is very big, and affect the accuracy of +		 * watchdog check. So give system some space and suspend the +		 * watchdog check for 5 minutes. +		 */ +		if (read_ret == WD_READ_SKIP) { +			/* +			 * As the watchdog timer will be suspended, and +			 * cs->last could keep unchanged for 5 minutes, reset +			 * the counters. +			 */ +			clocksource_reset_watchdog(); +			extra_wait = HZ * 300; +			break; +		} +  		/* Clocksource initialized ? */  		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||  		    atomic_read(&watchdog_reset_pending)) { @@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)  		/* Check the deviation from the watchdog clocksource. 
*/  		md = cs->uncertainty_margin + watchdog->uncertainty_margin;  		if (abs(cs_nsec - wd_nsec) > md) { +			u64 cs_wd_msec; +			u64 wd_msec; +			u32 wd_rem; +  			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",  				smp_processor_id(), cs->name);  			pr_warn("                      '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",  				watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);  			pr_warn("                      '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",  				cs->name, cs_nsec, csnow, cslast, cs->mask); +			cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem); +			wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem); +			pr_warn("                      Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", +				cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);  			if (curr_clocksource == cs)  				pr_warn("                      '%s' is current clocksource.\n", cs->name);  			else if (curr_clocksource) @@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)  	 * pair clocksource_stop_watchdog() clocksource_start_watchdog().  	 */  	if (!timer_pending(&watchdog_timer)) { -		watchdog_timer.expires += WATCHDOG_INTERVAL; +		watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;  		add_timer_on(&watchdog_timer, next_cpu);  	}  out: @@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)  	watchdog_running = 0;  } -static inline void clocksource_reset_watchdog(void) -{ -	struct clocksource *cs; - -	list_for_each_entry(cs, &watchdog_list, wd_list) -		cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} -  static void clocksource_resume_watchdog(void)  {  	atomic_inc(&watchdog_reset_pending); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3ae661ab6260..e8c08292defc 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2089,7 +2089,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,  	u64 slack;  	slack = current->timer_slack_ns; -	if (dl_task(current) || rt_task(current)) +	if (rt_task(current))  		slack = 0;  	hrtimer_init_sleeper_on_stack(&t, clockid, mode); @@ -2126,6 +2126,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,  	if (!timespec64_valid(&tu))  		return -EINVAL; +	current->restart_block.fn = do_no_restart_syscall;  	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;  	current->restart_block.nanosleep.rmtp = rmtp;  	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, @@ -2147,6 +2148,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,  	if (!timespec64_valid(&tu))  		return -EINVAL; +	current->restart_block.fn = do_no_restart_syscall;  	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;  	current->restart_block.nanosleep.compat_rmtp = rmtp;  	return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, @@ -2270,7 +2272,7 @@ void __init hrtimers_init(void)  /**   * schedule_hrtimeout_range_clock - sleep until timeout   * @expires:	timeout value (ktime_t) - * @delta:	slack in expires timeout (ktime_t) + * @delta:	slack in expires timeout (ktime_t) for SCHED_OTHER tasks   * @mode:	timer mode   * @clock_id:	timer clock to be used   */ @@ -2297,6 +2299,13 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,  		return -EINTR;  	} +	/* +	 * Override any slack passed by the user if under +	 * rt contraints. 
+	 */ +	if (rt_task(current)) +		delta = 0; +  	hrtimer_init_sleeper_on_stack(&t, clock_id, mode);  	hrtimer_set_expires_range_ns(&t.timer, *expires, delta);  	hrtimer_sleeper_start_expires(&t, mode); @@ -2316,7 +2325,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);  /**   * schedule_hrtimeout_range - sleep until timeout   * @expires:	timeout value (ktime_t) - * @delta:	slack in expires timeout (ktime_t) + * @delta:	slack in expires timeout (ktime_t) for SCHED_OTHER tasks   * @mode:	timer mode   *   * Make the current task sleep until the given expiry time has @@ -2324,7 +2333,8 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);   * the current task state has been set (see set_current_state()).   *   * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. + * actual wakeup to a time that is both power and performance friendly + * for regular (non RT/DL) tasks.   * The kernel give the normal best effort behavior for "@expires+@delta",   * but may decide to fire the timer earlier, but no earlier than @expires.   * diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index cb925e8ef9a8..2f5e9b34022c 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -243,13 +243,12 @@ static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,   */  static inline void __update_gt_cputime(atomic64_t *cputime, u64 sum_cputime)  { -	u64 curr_cputime; -retry: -	curr_cputime = atomic64_read(cputime); -	if (sum_cputime > curr_cputime) { -		if (atomic64_cmpxchg(cputime, curr_cputime, sum_cputime) != curr_cputime) -			goto retry; -	} +	u64 curr_cputime = atomic64_read(cputime); + +	do { +		if (sum_cputime <= curr_cputime) +			return; +	} while (!atomic64_try_cmpxchg(cputime, &curr_cputime, sum_cputime));  }  static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 90ea5f373e50..828aeecbd1e8 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -147,6 +147,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,  		return -EINVAL;  	if (flags & TIMER_ABSTIME)  		rmtp = NULL; +	current->restart_block.fn = do_no_restart_syscall;  	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;  	current->restart_block.nanosleep.rmtp = rmtp;  	texp = timespec64_to_ktime(t); @@ -240,6 +241,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,  		return -EINVAL;  	if (flags & TIMER_ABSTIME)  		rmtp = NULL; +	current->restart_block.fn = do_no_restart_syscall;  	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;  	current->restart_block.nanosleep.compat_rmtp = rmtp;  	texp = timespec64_to_ktime(t); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 5dead89308b7..0c8a87a11b39 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1270,6 +1270,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,  		return -EINVAL;  	if (flags & TIMER_ABSTIME)  		rmtp = NULL; +	current->restart_block.fn = do_no_restart_syscall;  	current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE;  	current->restart_block.nanosleep.rmtp = rmtp; @@ -1297,6 +1298,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags,  		return -EINVAL;  	if (flags & TIMER_ABSTIME)  		rmtp = NULL; +	current->restart_block.fn = do_no_restart_syscall;  	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;  	current->restart_block.nanosleep.compat_rmtp = rmtp; diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c index 13b11eb62685..20d5df631570 100644 --- a/kernel/time/test_udelay.c +++ b/kernel/time/test_udelay.c @@ -149,7 +149,7 @@ module_init(udelay_test_init);  static void __exit udelay_test_exit(void)  {  	mutex_lock(&udelay_test_lock); -	debugfs_remove(debugfs_lookup(DEBUGFS_FILENAME, NULL)); +	debugfs_lookup_and_remove(DEBUGFS_FILENAME, NULL);  	mutex_unlock(&udelay_test_lock);  } diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 797eb93103ad..e28f9210f8a1 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -56,25 +56,20 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)  	 * hrtimer callback function is currently running, then  	 * hrtimer_start() cannot move it and the timer stays on the CPU on  	 * which it is assigned at the moment. +	 */ +	hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); +	/* +	 * The core tick broadcast mode expects bc->bound_on to be set +	 * correctly to prevent a CPU which has the broadcast hrtimer +	 * armed from going deep idle.  	 * -	 * As this can be called from idle code, the hrtimer_start() -	 * invocation has to be wrapped with RCU_NONIDLE() as -	 * hrtimer_start() can call into tracing. +	 * As tick_broadcast_lock is held, nothing can change the cpu +	 * base which was just established in hrtimer_start() above. So +	 * the below access is safe even without holding the hrtimer +	 * base lock.  	 */ -	RCU_NONIDLE( { -		hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED_HARD); -		/* -		 * The core tick broadcast mode expects bc->bound_on to be set -		 * correctly to prevent a CPU which has the broadcast hrtimer -		 * armed from going deep idle. -		 * -		 * As tick_broadcast_lock is held, nothing can change the cpu -		 * base which was just established in hrtimer_start() above. So -		 * the below access is safe even without holding the hrtimer -		 * base lock. -		 */ -		bc->bound_on = bctimer.base->cpu_base->cpu; -	} ); +	bc->bound_on = bctimer.base->cpu_base->cpu; +  	return 0;  } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f7fe6fe36173..93bf2b4e47e5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -622,9 +622,13 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void)   * to avoid a deep idle transition as we are about to get the   * broadcast IPI right away.   
*/ -int tick_check_broadcast_expired(void) +noinstr int tick_check_broadcast_expired(void)  { +#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H +	return arch_test_bit(smp_processor_id(), cpumask_bits(tick_broadcast_force_mask)); +#else  	return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +#endif  }  /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 475ecceda768..5e2c2c26b3cc 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,7 @@  #include "tick-internal.h"  /** - * tick_program_event + * tick_program_event - program the CPU local timer device for the next event   */  int tick_program_event(ktime_t expires, int force)  { @@ -99,7 +99,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))  }  /** - * tick_check_oneshot_mode - check whether the system is in oneshot mode + * tick_oneshot_mode_active - check whether the system is in oneshot mode   *   * returns 1 when either nohz or highres are enabled. otherwise 0.   */ diff --git a/kernel/time/time.c b/kernel/time/time.c index 526257b3727c..f4198af60fee 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -462,7 +462,7 @@ struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)  EXPORT_SYMBOL(ns_to_kernel_old_timeval);  /** - * set_normalized_timespec - set timespec sec and nsec parts and normalize + * set_normalized_timespec64 - set timespec sec and nsec parts and normalize   *   * @ts:		pointer to timespec variable to be set   * @sec:	seconds to set @@ -526,7 +526,7 @@ struct timespec64 ns_to_timespec64(s64 nsec)  EXPORT_SYMBOL(ns_to_timespec64);  /** - * msecs_to_jiffies: - convert milliseconds to jiffies + * __msecs_to_jiffies: - convert milliseconds to jiffies   * @m:	time in milliseconds   *   * conversion is done as follows: @@ -541,12 +541,12 @@ EXPORT_SYMBOL(ns_to_timespec64);   *   handling any 32-bit overflows.   *   for the details see __msecs_to_jiffies()   * - * msecs_to_jiffies() checks for the passed in value being a constant + * __msecs_to_jiffies() checks for the passed in value being a constant   * via __builtin_constant_p() allowing gcc to eliminate most of the   * code, __msecs_to_jiffies() is called if the value passed does not   * allow constant folding and the actual conversion must be done at   * runtime. - * the _msecs_to_jiffies helpers are the HZ dependent conversion + * The _msecs_to_jiffies helpers are the HZ dependent conversion   * routines found in include/linux/jiffies.h   */  unsigned long __msecs_to_jiffies(const unsigned int m) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f72b9f1de178..5579ead449f2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1590,10 +1590,10 @@ void __weak read_persistent_clock64(struct timespec64 *ts)  /**   * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset   *                                        from the boot. + * @wall_time:	  current time as returned by persistent clock + * @boot_offset:  offset that is defined as wall_time - boot_time   *   * Weak dummy function for arches that do not yet support it. - * @wall_time:	- current time as returned by persistent clock - * @boot_offset: - offset that is defined as wall_time - boot_time   *   * The default function calculates offset based on the current value of   * local_clock(). 
This way architectures that support sched_clock() but don't @@ -1701,7 +1701,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,  }  #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) -/** +/*   * We have three kinds of time sources to use for sleep time   * injection, the preference order is:   * 1) non-stop clocksource @@ -1722,7 +1722,7 @@ bool timekeeping_rtc_skipresume(void)  	return !suspend_timing_needed;  } -/** +/*   * 1) can be determined whether to use or not only when doing   * timekeeping_resume() which is invoked after rtc_suspend(),   * so we can't skip rtc_suspend() surely if system has 1). diff --git a/kernel/torture.c b/kernel/torture.c index 789aeb0e1159..1a0519b836ac 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -450,7 +450,7 @@ unsigned long  torture_random(struct torture_random_state *trsp)  {  	if (--trsp->trs_count < 0) { -		trsp->trs_state += (unsigned long)local_clock(); +		trsp->trs_state += (unsigned long)local_clock() + raw_smp_processor_id();  		trsp->trs_count = TORTURE_RANDOM_REFRESH;  	}  	trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT + @@ -915,7 +915,7 @@ void torture_kthread_stopping(char *title)  	VERBOSE_TOROUT_STRING(buf);  	while (!kthread_should_stop()) {  		torture_shutdown_absorb(title); -		schedule_timeout_uninterruptible(1); +		schedule_timeout_uninterruptible(HZ / 20);  	}  }  EXPORT_SYMBOL_GPL(torture_kthread_stopping); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 197545241ab8..a856d4a34c67 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -42,6 +42,9 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS  config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS  	bool +config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS +	bool +  config HAVE_DYNAMIC_FTRACE_WITH_ARGS  	bool  	help @@ -239,7 +242,7 @@ config DYNAMIC_FTRACE  	  enabled, and the functions not enabled will not affect  	  performance of the system. -	  See the files in /sys/kernel/debug/tracing: +	  See the files in /sys/kernel/tracing:  	    available_filter_functions  	    set_ftrace_filter  	    set_ftrace_notrace @@ -257,6 +260,10 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS  	depends on DYNAMIC_FTRACE_WITH_REGS  	depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +config DYNAMIC_FTRACE_WITH_CALL_OPS +	def_bool y +	depends on HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS +  config DYNAMIC_FTRACE_WITH_ARGS  	def_bool y  	depends on DYNAMIC_FTRACE @@ -299,7 +306,7 @@ config STACK_TRACER  	select KALLSYMS  	help  	  This special tracer records the maximum stack footprint of the -	  kernel and displays it in /sys/kernel/debug/tracing/stack_trace. +	  kernel and displays it in /sys/kernel/tracing/stack_trace.  	  This tracer works by hooking into every function call that the  	  kernel executes, and keeping a maximum stack depth value and @@ -339,7 +346,7 @@ config IRQSOFF_TRACER  	  disabled by default and can be runtime (re-)started  	  via: -	      echo 0 > /sys/kernel/debug/tracing/tracing_max_latency +	      echo 0 > /sys/kernel/tracing/tracing_max_latency  	  (Note that kernel size and overhead increase with this option  	  enabled. This option and the preempt-off timing option can be @@ -363,7 +370,7 @@ config PREEMPT_TRACER  	  disabled by default and can be runtime (re-)started  	  via: -	      echo 0 > /sys/kernel/debug/tracing/tracing_max_latency +	      echo 0 > /sys/kernel/tracing/tracing_max_latency  	  (Note that kernel size and overhead increase with this option  	  enabled. 
This option and the irqs-off timing option can be @@ -515,7 +522,7 @@ config TRACER_SNAPSHOT  	  Allow tracing users to take snapshot of the current buffer using the  	  ftrace interface, e.g.: -	      echo 1 > /sys/kernel/debug/tracing/snapshot +	      echo 1 > /sys/kernel/tracing/snapshot  	      cat snapshot  config TRACER_SNAPSHOT_PER_CPU_SWAP @@ -527,7 +534,7 @@ config TRACER_SNAPSHOT_PER_CPU_SWAP  	  full swap (all buffers). If this is set, then the following is  	  allowed: -	      echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot +	      echo 1 > /sys/kernel/tracing/per_cpu/cpu2/snapshot  	  After which, only the tracing buffer for CPU 2 was swapped with  	  the main tracing buffer, and the other CPU buffers remain the same. @@ -574,7 +581,7 @@ config PROFILE_ANNOTATED_BRANCHES  	  This tracer profiles all likely and unlikely macros  	  in the kernel. It will display the results in: -	  /sys/kernel/debug/tracing/trace_stat/branch_annotated +	  /sys/kernel/tracing/trace_stat/branch_annotated  	  Note: this will add a significant overhead; only turn this  	  on if you need to profile the system's use of these macros. @@ -587,7 +594,7 @@ config PROFILE_ALL_BRANCHES  	  taken in the kernel is recorded whether it hit or miss.  	  The results will be displayed in: -	  /sys/kernel/debug/tracing/trace_stat/branch_all +	  /sys/kernel/tracing/trace_stat/branch_all  	  This option also enables the likely/unlikely profiler. @@ -638,8 +645,8 @@ config BLK_DEV_IO_TRACE  	  Tracing also is possible using the ftrace interface, e.g.:  	    echo 1 > /sys/block/sda/sda1/trace/enable -	    echo blk > /sys/kernel/debug/tracing/current_tracer -	    cat /sys/kernel/debug/tracing/trace_pipe +	    echo blk > /sys/kernel/tracing/current_tracer +	    cat /sys/kernel/tracing/trace_pipe  	  If unsure, say N. @@ -933,8 +940,8 @@ config RING_BUFFER_RECORD_RECURSION  	default y  	help  	  The ring buffer has its own internal recursion. Although when -	  recursion happens it wont cause harm because of the protection, -	  but it does cause an unwanted overhead. Enabling this option will +	  recursion happens it won't cause harm because of the protection, +	  but it does cause unwanted overhead. Enabling this option will  	  place where recursion was detected into the ftrace "recursed_functions"  	  file. @@ -1017,8 +1024,8 @@ config RING_BUFFER_STARTUP_TEST  	 The test runs for 10 seconds. This will slow your boot time  	 by at least 10 more seconds. -	 At the end of the test, statics and more checks are done. -	 It will output the stats of each per cpu buffer. What +	 At the end of the test, statistics and more checks are done. +	 It will output the stats of each per cpu buffer: What  	 was written, the sizes, what was read, what was lost, and  	 other similar details. diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 918a7d12df8f..d5d94510afd3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)  	 * under 'q->debugfs_dir', thus lookup and remove them.  	 
*/  	if (!bt->dir) { -		debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); -		debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); +		debugfs_lookup_and_remove("dropped", q->debugfs_dir); +		debugfs_lookup_and_remove("msg", q->debugfs_dir);  	} else {  		debugfs_remove(bt->dir);  	} @@ -729,14 +729,10 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);   **/  int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)  { -	struct request_queue *q; +	struct request_queue *q = bdev_get_queue(bdev);  	int ret, start = 0;  	char b[BDEVNAME_SIZE]; -	q = bdev_get_queue(bdev); -	if (!q) -		return -ENXIO; -  	mutex_lock(&q->debugfs_mutex);  	switch (cmd) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3bbd3f0c810c..e8da032bb6fc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -369,8 +369,6 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)  	return &bpf_probe_write_user_proto;  } -static DEFINE_RAW_SPINLOCK(trace_printk_lock); -  #define MAX_TRACE_PRINTK_VARARGS	3  #define BPF_TRACE_PRINTK_SIZE		1024 @@ -378,23 +376,22 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,  	   u64, arg2, u64, arg3)  {  	u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; -	u32 *bin_args; -	static char buf[BPF_TRACE_PRINTK_SIZE]; -	unsigned long flags; +	struct bpf_bprintf_data data = { +		.get_bin_args	= true, +		.get_buf	= true, +	};  	int ret; -	ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args, -				  MAX_TRACE_PRINTK_VARARGS); +	ret = bpf_bprintf_prepare(fmt, fmt_size, args, +				  MAX_TRACE_PRINTK_VARARGS, &data);  	if (ret < 0)  		return ret; -	raw_spin_lock_irqsave(&trace_printk_lock, flags); -	ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); +	ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); -	trace_bpf_trace_printk(buf); -	raw_spin_unlock_irqrestore(&trace_printk_lock, flags); +	trace_bpf_trace_printk(data.buf); -	bpf_bprintf_cleanup(); +	bpf_bprintf_cleanup(&data);  	return ret;  } @@ -427,30 +424,29 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)  	return &bpf_trace_printk_proto;  } -BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, data, +BPF_CALL_4(bpf_trace_vprintk, char *, fmt, u32, fmt_size, const void *, args,  	   u32, data_len)  { -	static char buf[BPF_TRACE_PRINTK_SIZE]; -	unsigned long flags; +	struct bpf_bprintf_data data = { +		.get_bin_args	= true, +		.get_buf	= true, +	};  	int ret, num_args; -	u32 *bin_args;  	if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || -	    (data_len && !data)) +	    (data_len && !args))  		return -EINVAL;  	num_args = data_len / 8; -	ret = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); +	ret = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);  	if (ret < 0)  		return ret; -	raw_spin_lock_irqsave(&trace_printk_lock, flags); -	ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); +	ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt, data.bin_args); -	trace_bpf_trace_printk(buf); -	raw_spin_unlock_irqrestore(&trace_printk_lock, flags); +	trace_bpf_trace_printk(data.buf); -	bpf_bprintf_cleanup(); +	bpf_bprintf_cleanup(&data);  	return ret;  } @@ -472,23 +468,25 @@ const struct bpf_func_proto *bpf_get_trace_vprintk_proto(void)  }  BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, -	   const void *, data, u32, data_len) +	   const void *, args, u32, data_len)  { +	struct bpf_bprintf_data data = { +		.get_bin_args	= true, +	};  	int err, 
num_args; -	u32 *bin_args;  	if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || -	    (data_len && !data)) +	    (data_len && !args))  		return -EINVAL;  	num_args = data_len / 8; -	err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); +	err = bpf_bprintf_prepare(fmt, fmt_size, args, num_args, &data);  	if (err < 0)  		return err; -	seq_bprintf(m, fmt, bin_args); +	seq_bprintf(m, fmt, data.bin_args); -	bpf_bprintf_cleanup(); +	bpf_bprintf_cleanup(&data);  	return seq_has_overflowed(m) ? -EOVERFLOW : 0;  } @@ -687,8 +685,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,  	}  	perf_sample_data_init(sd, 0, 0); -	sd->raw = &raw; -	sd->sample_flags |= PERF_SAMPLE_RAW; +	perf_sample_save_raw_data(sd, &raw);  	err = __bpf_perf_event_output(regs, map, flags, sd); @@ -746,8 +743,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,  	perf_fetch_caller_regs(regs);  	perf_sample_data_init(sd, 0, 0); -	sd->raw = &raw; -	sd->sample_flags |= PERF_SAMPLE_RAW; +	perf_sample_save_raw_data(sd, &raw);  	ret = __bpf_perf_event_output(regs, map, flags, sd);  out: @@ -833,6 +829,7 @@ static void do_bpf_send_signal(struct irq_work *entry)  	work = container_of(entry, struct send_signal_irq_work, irq_work);  	group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type); +	put_task_struct(work->task);  }  static int bpf_send_signal_common(u32 sig, enum pid_type type) @@ -848,6 +845,9 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)  		return -EPERM;  	if (unlikely(!nmi_uaccess_okay()))  		return -EPERM; +	/* Task should not be pid=1 to avoid kernel panic. */ +	if (unlikely(is_global_init(current))) +		return -EPERM;  	if (irqs_disabled()) {  		/* Do an early check on signal validity. Otherwise, @@ -864,7 +864,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)  		 * to the irq_work. The current task may change when queued  		 * irq works get executed.  		 */ -		work->task = current; +		work->task = get_task_struct(current);  		work->sig = sig;  		work->type = type;  		irq_work_queue(&work->irq_work); @@ -1235,7 +1235,7 @@ __diag_ignore_all("-Wmissing-prototypes",   * Return: a bpf_key pointer with a valid key pointer if the key is found, a   *         NULL pointer otherwise.   */ -struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)  {  	key_ref_t key_ref;  	struct bpf_key *bkey; @@ -1284,7 +1284,7 @@ struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)   * Return: a bpf_key pointer with an invalid key pointer set from the   *         pre-determined ID on success, a NULL pointer otherwise   */ -struct bpf_key *bpf_lookup_system_key(u64 id) +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id)  {  	struct bpf_key *bkey; @@ -1308,7 +1308,7 @@ struct bpf_key *bpf_lookup_system_key(u64 id)   * Decrement the reference count of the key inside *bkey*, if the pointer   * is valid, and free *bkey*.   */ -void bpf_key_put(struct bpf_key *bkey) +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey)  {  	if (bkey->has_ref)  		key_put(bkey->key); @@ -1328,7 +1328,7 @@ void bpf_key_put(struct bpf_key *bkey)   *   * Return: 0 on success, a negative value on error.   
*/ -int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr, +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,  			       struct bpf_dynptr_kern *sig_ptr,  			       struct bpf_key *trusted_keyring)  { @@ -2684,69 +2684,77 @@ static void symbols_swap_r(void *a, void *b, int size, const void *priv)  	}  } -struct module_addr_args { -	unsigned long *addrs; -	u32 addrs_cnt; +struct modules_array {  	struct module **mods;  	int mods_cnt;  	int mods_cap;  }; -static int module_callback(void *data, const char *name, -			   struct module *mod, unsigned long addr) +static int add_module(struct modules_array *arr, struct module *mod)  { -	struct module_addr_args *args = data;  	struct module **mods; -	/* We iterate all modules symbols and for each we: -	 * - search for it in provided addresses array -	 * - if found we check if we already have the module pointer stored -	 *   (we iterate modules sequentially, so we can check just the last -	 *   module pointer) -	 * - take module reference and store it -	 */ -	if (!bsearch(&addr, args->addrs, args->addrs_cnt, sizeof(addr), -		       bpf_kprobe_multi_addrs_cmp)) -		return 0; - -	if (args->mods && args->mods[args->mods_cnt - 1] == mod) -		return 0; - -	if (args->mods_cnt == args->mods_cap) { -		args->mods_cap = max(16, args->mods_cap * 3 / 2); -		mods = krealloc_array(args->mods, args->mods_cap, sizeof(*mods), GFP_KERNEL); +	if (arr->mods_cnt == arr->mods_cap) { +		arr->mods_cap = max(16, arr->mods_cap * 3 / 2); +		mods = krealloc_array(arr->mods, arr->mods_cap, sizeof(*mods), GFP_KERNEL);  		if (!mods)  			return -ENOMEM; -		args->mods = mods; +		arr->mods = mods;  	} -	if (!try_module_get(mod)) -		return -EINVAL; - -	args->mods[args->mods_cnt] = mod; -	args->mods_cnt++; +	arr->mods[arr->mods_cnt] = mod; +	arr->mods_cnt++;  	return 0;  } +static bool has_module(struct modules_array *arr, struct module *mod) +{ +	int i; + +	for (i = arr->mods_cnt - 1; i >= 0; i--) { +		if (arr->mods[i] == mod) +			return true; +	} +	return false; +} +  static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u32 addrs_cnt)  { -	struct module_addr_args args = { -		.addrs     = addrs, -		.addrs_cnt = addrs_cnt, -	}; -	int err; +	struct modules_array arr = {}; +	u32 i, err = 0; + +	for (i = 0; i < addrs_cnt; i++) { +		struct module *mod; + +		preempt_disable(); +		mod = __module_address(addrs[i]); +		/* Either no module or we it's already stored  */ +		if (!mod || has_module(&arr, mod)) { +			preempt_enable(); +			continue; +		} +		if (!try_module_get(mod)) +			err = -EINVAL; +		preempt_enable(); +		if (err) +			break; +		err = add_module(&arr, mod); +		if (err) { +			module_put(mod); +			break; +		} +	}  	/* We return either err < 0 in case of error, ... */ -	err = module_kallsyms_on_each_symbol(module_callback, &args);  	if (err) { -		kprobe_multi_put_modules(args.mods, args.mods_cnt); -		kfree(args.mods); +		kprobe_multi_put_modules(arr.mods, arr.mods_cnt); +		kfree(arr.mods);  		return err;  	}  	/* or number of modules found if everything is ok. 
*/ -	*mods = args.mods; -	return args.mods_cnt; +	*mods = arr.mods; +	return arr.mods_cnt;  }  int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) @@ -2859,13 +2867,6 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr  		       bpf_kprobe_multi_cookie_cmp,  		       bpf_kprobe_multi_cookie_swap,  		       link); -	} else { -		/* -		 * We need to sort addrs array even if there are no cookies -		 * provided, to allow bsearch in get_modules_for_addrs. -		 */ -		sort(addrs, cnt, sizeof(*addrs), -		       bpf_kprobe_multi_addrs_cmp, NULL);  	}  	err = get_modules_for_addrs(&link->mods, addrs, cnt); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 442438b93fe9..29baa97d0d53 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -125,6 +125,33 @@ struct ftrace_ops global_ops;  void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  			  struct ftrace_ops *op, struct ftrace_regs *fregs); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +/* + * Stub used to invoke the list ops without requiring a separate trampoline. + */ +const struct ftrace_ops ftrace_list_ops = { +	.func	= ftrace_ops_list_func, +	.flags	= FTRACE_OPS_FL_STUB, +}; + +static void ftrace_ops_nop_func(unsigned long ip, unsigned long parent_ip, +				struct ftrace_ops *op, +				struct ftrace_regs *fregs) +{ +	/* do nothing */ +} + +/* + * Stub used when a call site is disabled. May be called transiently by threads + * which have made it into ftrace_caller but haven't yet recovered the ops at + * the point the call site is disabled. + */ +const struct ftrace_ops ftrace_nop_ops = { +	.func	= ftrace_ops_nop_func, +	.flags  = FTRACE_OPS_FL_STUB, +}; +#endif +  static inline void ftrace_ops_init(struct ftrace_ops *ops)  {  #ifdef CONFIG_DYNAMIC_FTRACE @@ -1248,12 +1275,17 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)  	call_rcu(&hash->rcu, __free_ftrace_hash_rcu);  } +/** + * ftrace_free_filter - remove all filters for an ftrace_ops + * @ops - the ops to remove the filters from + */  void ftrace_free_filter(struct ftrace_ops *ops)  {  	ftrace_ops_init(ops);  	free_ftrace_hash(ops->func_hash->filter_hash);  	free_ftrace_hash(ops->func_hash->notrace_hash);  } +EXPORT_SYMBOL_GPL(ftrace_free_filter);  static struct ftrace_hash *alloc_ftrace_hash(int size_bits)  { @@ -1814,6 +1846,18 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,  			 * if rec count is zero.  			 */  		} + +		/* +		 * If the rec has a single associated ops, and ops->func can be +		 * called directly, allow the call site to call via the ops. +		 */ +		if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS) && +		    ftrace_rec_count(rec) == 1 && +		    ftrace_ops_get_func(ops) == ops->func) +			rec->flags |= FTRACE_FL_CALL_OPS; +		else +			rec->flags &= ~FTRACE_FL_CALL_OPS; +  		count++;  		/* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ @@ -2108,8 +2152,9 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)  		struct ftrace_ops *ops = NULL;  		pr_info("ftrace record flags: %lx\n", rec->flags); -		pr_cont(" (%ld)%s", ftrace_rec_count(rec), -			rec->flags & FTRACE_FL_REGS ? " R" : "  "); +		pr_cont(" (%ld)%s%s", ftrace_rec_count(rec), +			rec->flags & FTRACE_FL_REGS ? " R" : "  ", +			rec->flags & FTRACE_FL_CALL_OPS ? 
" O" : "  ");  		if (rec->flags & FTRACE_FL_TRAMP_EN) {  			ops = ftrace_find_tramp_ops_any(rec);  			if (ops) { @@ -2177,6 +2222,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)  		 * want the direct enabled (it will be done via the  		 * direct helper). But if DIRECT_EN is set, and  		 * the count is not one, we need to clear it. +		 *  		 */  		if (ftrace_rec_count(rec) == 1) {  			if (!(rec->flags & FTRACE_FL_DIRECT) != @@ -2185,6 +2231,19 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)  		} else if (rec->flags & FTRACE_FL_DIRECT_EN) {  			flag |= FTRACE_FL_DIRECT;  		} + +		/* +		 * Ops calls are special, as count matters. +		 * As with direct calls, they must only be enabled when count +		 * is one, otherwise they'll be handled via the list ops. +		 */ +		if (ftrace_rec_count(rec) == 1) { +			if (!(rec->flags & FTRACE_FL_CALL_OPS) != +			    !(rec->flags & FTRACE_FL_CALL_OPS_EN)) +				flag |= FTRACE_FL_CALL_OPS; +		} else if (rec->flags & FTRACE_FL_CALL_OPS_EN) { +			flag |= FTRACE_FL_CALL_OPS; +		}  	}  	/* If the state of this record hasn't changed, then do nothing */ @@ -2229,6 +2288,21 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)  					rec->flags &= ~FTRACE_FL_DIRECT_EN;  				}  			} + +			if (flag & FTRACE_FL_CALL_OPS) { +				if (ftrace_rec_count(rec) == 1) { +					if (rec->flags & FTRACE_FL_CALL_OPS) +						rec->flags |= FTRACE_FL_CALL_OPS_EN; +					else +						rec->flags &= ~FTRACE_FL_CALL_OPS_EN; +				} else { +					/* +					 * Can only call directly if there's +					 * only one set of associated ops. +					 */ +					rec->flags &= ~FTRACE_FL_CALL_OPS_EN; +				} +			}  		}  		/* @@ -2258,7 +2332,8 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)  			 * and REGS states. The _EN flags must be disabled though.  			 */  			rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | -					FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN); +					FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN | +					FTRACE_FL_CALL_OPS_EN);  	}  	ftrace_bug_type = FTRACE_BUG_NOP; @@ -2431,6 +2506,25 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)  	return NULL;  } +struct ftrace_ops * +ftrace_find_unique_ops(struct dyn_ftrace *rec) +{ +	struct ftrace_ops *op, *found = NULL; +	unsigned long ip = rec->ip; + +	do_for_each_ftrace_op(op, ftrace_ops_list) { + +		if (hash_contains_ip(ip, op->func_hash)) { +			if (found) +				return NULL; +			found = op; +		} + +	} while_for_each_ftrace_op(op); + +	return found; +} +  #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS  /* Protected by rcu_tasks for reading, and direct_mutex for writing */  static struct ftrace_hash *direct_functions = EMPTY_HASH; @@ -3780,11 +3874,12 @@ static int t_show(struct seq_file *m, void *v)  	if (iter->flags & FTRACE_ITER_ENABLED) {  		struct ftrace_ops *ops; -		seq_printf(m, " (%ld)%s%s%s", +		seq_printf(m, " (%ld)%s%s%s%s",  			   ftrace_rec_count(rec),  			   rec->flags & FTRACE_FL_REGS ? " R" : "  ",  			   rec->flags & FTRACE_FL_IPMODIFY ? " I" : "  ", -			   rec->flags & FTRACE_FL_DIRECT ? " D" : "  "); +			   rec->flags & FTRACE_FL_DIRECT ? " D" : "  ", +			   rec->flags & FTRACE_FL_CALL_OPS ? 
" O" : "  ");  		if (rec->flags & FTRACE_FL_TRAMP_EN) {  			ops = ftrace_find_tramp_ops_any(rec);  			if (ops) { @@ -3800,6 +3895,15 @@ static int t_show(struct seq_file *m, void *v)  		} else {  			add_trampoline_func(m, NULL, rec);  		} +		if (rec->flags & FTRACE_FL_CALL_OPS_EN) { +			ops = ftrace_find_unique_ops(rec); +			if (ops) { +				seq_printf(m, "\tops: %pS (%pS)", +					   ops, ops->func); +			} else { +				seq_puts(m, "\tops: ERROR!"); +			} +		}  		if (rec->flags & FTRACE_FL_DIRECT) {  			unsigned long direct; @@ -5839,6 +5943,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);   *   * Filters denote which functions should be enabled when tracing is enabled   * If @ip is NULL, it fails to update filter. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops).   */  int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,  			 int remove, int reset) @@ -5858,7 +5966,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);   *   * Filters denote which functions should be enabled when tracing is enabled   * If @ips array or any ip specified within is NULL , it fails to update filter. - */ + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops). +*/  int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,  			  unsigned int cnt, int remove, int reset)  { @@ -5900,6 +6012,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,   *   * Filters denote which functions should be enabled when tracing is enabled.   * If @buf is NULL and reset is set, all functions will be enabled for tracing. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops).   */  int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,  		       int len, int reset) @@ -5919,6 +6035,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);   * Notrace Filters denote which functions should not be enabled when tracing   * is enabled. If @buf is NULL and reset is set, all functions will be enabled   * for tracing. + * + * This can allocate memory which must be freed before @ops can be freed, + * either by removing each filtered addr or by using + * ftrace_free_filter(@ops).   */  int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,  			int len, int reset) @@ -8324,7 +8444,7 @@ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *a  	found_all = kallsyms_on_each_symbol(kallsyms_callback, &args);  	if (found_all)  		return 0; -	found_all = module_kallsyms_on_each_symbol(kallsyms_callback, &args); +	found_all = module_kallsyms_on_each_symbol(NULL, kallsyms_callback, &args);  	return found_all ? 0 : -ESRCH;  } diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index c736487fc0e4..4850fdfe27f1 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -21,7 +21,7 @@   * Then:   *   * # insmod kernel/trace/kprobe_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace   *   * You should see many instances of the "gen_kprobe_test" and   * "gen_kretprobe_test" events in the trace buffer. 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c366a0a9ddba..af50d931b020 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1581,19 +1581,6 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,  }  /** - * rb_check_list - make sure a pointer to a list has the last bits zero - */ -static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, -			 struct list_head *list) -{ -	if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) -		return 1; -	if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) -		return 1; -	return 0; -} - -/**   * rb_check_pages - integrity check of buffer pages   * @cpu_buffer: CPU buffer with pages to test   * @@ -1602,36 +1589,27 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,   */  static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)  { -	struct list_head *head = cpu_buffer->pages; -	struct buffer_page *bpage, *tmp; - -	/* Reset the head page if it exists */ -	if (cpu_buffer->head_page) -		rb_set_head_page(cpu_buffer); - -	rb_head_page_deactivate(cpu_buffer); +	struct list_head *head = rb_list_head(cpu_buffer->pages); +	struct list_head *tmp; -	if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) -		return -1; -	if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) +	if (RB_WARN_ON(cpu_buffer, +			rb_list_head(rb_list_head(head->next)->prev) != head))  		return -1; -	if (rb_check_list(cpu_buffer, head)) +	if (RB_WARN_ON(cpu_buffer, +			rb_list_head(rb_list_head(head->prev)->next) != head))  		return -1; -	list_for_each_entry_safe(bpage, tmp, head, list) { +	for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {  		if (RB_WARN_ON(cpu_buffer, -			       bpage->list.next->prev != &bpage->list)) +				rb_list_head(rb_list_head(tmp->next)->prev) != tmp))  			return -1; +  		if (RB_WARN_ON(cpu_buffer, -			       bpage->list.prev->next != &bpage->list)) -			return -1; -		if (rb_check_list(cpu_buffer, &bpage->list)) +				rb_list_head(rb_list_head(tmp->prev)->next) != tmp))  			return -1;  	} -	rb_head_page_activate(cpu_buffer); -  	return 0;  } @@ -2886,7 +2864,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,  		  sched_clock_stable() ? 
"" :  		  "If you just came from a suspend/resume,\n"  		  "please switch to the trace global clock:\n" -		  "  echo global > /sys/kernel/debug/tracing/trace_clock\n" +		  "  echo global > /sys/kernel/tracing/trace_clock\n"  		  "or add trace_clock=global to the kernel command line\n");  } @@ -5626,11 +5604,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);   */  void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)  { -	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; +	struct ring_buffer_per_cpu *cpu_buffer;  	struct buffer_data_page *bpage = data;  	struct page *page = virt_to_page(bpage);  	unsigned long flags; +	if (!buffer || !buffer->buffers || !buffer->buffers[cpu]) +		return; + +	cpu_buffer = buffer->buffers[cpu]; +  	/* If the page is still in use someplace else, we can't reuse it */  	if (page_ref_count(page) > 1)  		goto out; diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 6c97cc2d754a..7e9061828c24 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -516,7 +516,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user  	struct rv_monitor_def *mdef;  	int retval = -EINVAL;  	bool enable = true; -	char *ptr = buff; +	char *ptr;  	int len;  	if (count < 1 || count > MAX_RV_MONITOR_NAME_SIZE + 1) diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 8d77526892f4..8dfe85499d4a 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -22,7 +22,7 @@   * Then:   *   * # insmod kernel/trace/synth_event_gen_test.ko - * # cat /sys/kernel/debug/tracing/trace + * # cat /sys/kernel/tracing/trace   *   * You should see several events in the trace buffer -   * "create_synth_test", "empty_synth_test", and several instances of diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a555a861b978..45551c7b4c36 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -49,6 +49,8 @@  #include <linux/irq_work.h>  #include <linux/workqueue.h> +#include <asm/setup.h> /* COMMAND_LINE_SIZE */ +  #include "trace.h"  #include "trace_output.h" @@ -186,6 +188,12 @@ static char *default_bootup_tracer;  static bool allocate_snapshot;  static bool snapshot_at_boot; +static char boot_instance_info[COMMAND_LINE_SIZE] __initdata; +static int boot_instance_index; + +static char boot_snapshot_info[COMMAND_LINE_SIZE] __initdata; +static int boot_snapshot_index; +  static int __init set_cmdline_ftrace(char *str)  {  	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); @@ -222,9 +230,22 @@ __setup("traceoff_on_warning", stop_trace_on_warning);  static int __init boot_alloc_snapshot(char *str)  { -	allocate_snapshot = true; -	/* We also need the main ring buffer expanded */ -	ring_buffer_expanded = true; +	char *slot = boot_snapshot_info + boot_snapshot_index; +	int left = sizeof(boot_snapshot_info) - boot_snapshot_index; +	int ret; + +	if (str[0] == '=') { +		str++; +		if (strlen(str) >= left) +			return -1; + +		ret = snprintf(slot, left, "%s\t", str); +		boot_snapshot_index += ret; +	} else { +		allocate_snapshot = true; +		/* We also need the main ring buffer expanded */ +		ring_buffer_expanded = true; +	}  	return 1;  }  __setup("alloc_snapshot", boot_alloc_snapshot); @@ -239,6 +260,23 @@ static int __init boot_snapshot(char *str)  __setup("ftrace_boot_snapshot", boot_snapshot); +static int __init boot_instance(char *str) +{ +	char *slot = boot_instance_info + boot_instance_index; +	int left = sizeof(boot_instance_info) - 
boot_instance_index; +	int ret; + +	if (strlen(str) >= left) +		return -1; + +	ret = snprintf(slot, left, "%s\t", str); +	boot_instance_index += ret; + +	return 1; +} +__setup("trace_instance=", boot_instance); + +  static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;  static int __init set_trace_boot_options(char *str) @@ -1001,13 +1039,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev  		ring_buffer_unlock_commit(buffer);  } -/** - * __trace_puts - write a constant string into the trace buffer. - * @ip:	   The address of the caller - * @str:   The constant string to write - * @size:  The size of the string. - */ -int __trace_puts(unsigned long ip, const char *str, int size) +int __trace_array_puts(struct trace_array *tr, unsigned long ip, +		       const char *str, int size)  {  	struct ring_buffer_event *event;  	struct trace_buffer *buffer; @@ -1015,7 +1048,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)  	unsigned int trace_ctx;  	int alloc; -	if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) +	if (!(tr->trace_flags & TRACE_ITER_PRINTK))  		return 0;  	if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1024,7 +1057,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)  	alloc = sizeof(*entry) + size + 2; /* possible \n added */  	trace_ctx = tracing_gen_ctx(); -	buffer = global_trace.array_buffer.buffer; +	buffer = tr->array_buffer.buffer;  	ring_buffer_nest_start(buffer);  	event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,  					    trace_ctx); @@ -1046,11 +1079,23 @@ int __trace_puts(unsigned long ip, const char *str, int size)  		entry->buf[size] = '\0';  	__buffer_unlock_commit(buffer, event); -	ftrace_trace_stack(&global_trace, buffer, trace_ctx, 4, NULL); +	ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL);   out:  	ring_buffer_nest_end(buffer);  	return size;  } +EXPORT_SYMBOL_GPL(__trace_array_puts); + +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip:	   The address of the caller + * @str:   The constant string to write + * @size:  The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ +	return __trace_array_puts(&global_trace, ip, str, size); +}  EXPORT_SYMBOL_GPL(__trace_puts);  /** @@ -1142,7 +1187,7 @@ void tracing_snapshot_instance(struct trace_array *tr)   *   * Note, make sure to allocate the snapshot with either   * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * with: echo 1 > /sys/kernel/tracing/snapshot   *   * If the snapshot buffer is not allocated, it will stop tracing.   * Basically making a permanent snapshot. 
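
With __trace_puts() turned into a wrapper around the exported __trace_array_puts(), kernel code can drop a string into a specific trace instance rather than only the top-level buffer. A rough sketch of a caller, assuming an instance named "foo" is wanted (the instance name, message and wrapper function are illustrative only, not from the patch):

	#include <linux/trace.h>
	#include <linux/kernel.h>
	#include <linux/string.h>

	static void puts_into_instance(void)
	{
		static const char msg[] = "hello from an instance\n";
		struct trace_array *tr;

		/* Look up the instance "foo", creating it if it does not exist yet. */
		tr = trace_array_get_by_name("foo");
		if (!tr)
			return;

		__trace_array_puts(tr, _THIS_IP_, msg, strlen(msg));

		trace_array_put(tr);	/* drop the reference taken above */
	}

Passing _THIS_IP_ mirrors what the old __trace_puts() callers recorded as the print location.
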
@@ -3128,6 +3173,9 @@ void __trace_stack(struct trace_array *tr, unsigned int trace_ctx,  		return;  	} +	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_GENERIC_ENTRY))) +		return; +  	/*  	 * When an NMI triggers, RCU is enabled via ct_nmi_enter(),  	 * but if the above rcu_is_watching() failed, then the NMI @@ -5598,7 +5646,7 @@ static const char readme_msg[] =  #ifdef CONFIG_HIST_TRIGGERS  	"\t           s:[synthetic/]<event> <field> [<field>]\n"  #endif -	"\t           e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>]\n" +	"\t           e[:[<group>/][<event>]] <attached-group>.<attached-event> [<args>] [if <filter>]\n"  	"\t           -:[<group>/][<event>]\n"  #ifdef CONFIG_KPROBE_EVENTS  	"\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n" @@ -5615,7 +5663,7 @@ static const char readme_msg[] =  	"\t           $stack<index>, $stack, $retval, $comm,\n"  #endif  	"\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n" -	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, string, symbol,\n" +	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"  	"\t           b<bit-width>@<bit-offset>/<container-size>, ustring,\n"  	"\t           symstr, <type>\\[<array-size>\\]\n"  #ifdef CONFIG_HIST_TRIGGERS @@ -5757,7 +5805,7 @@ static const char readme_msg[] =  #ifdef CONFIG_SYNTH_EVENTS  	"  events/synthetic_events\t- Create/append/remove/show synthetic events\n"  	"\t  Write into this file to define/undefine new synthetic events.\n" -	"\t     example: echo 'myevent u64 lat; char name[]' >> synthetic_events\n" +	"\t     example: echo 'myevent u64 lat; char name[]; long[] stack' >> synthetic_events\n"  #endif  #endif  ; @@ -9148,9 +9196,6 @@ buffer_percent_write(struct file *filp, const char __user *ubuf,  	if (val > 100)  		return -EINVAL; -	if (!val) -		val = 1; -  	tr->buffer_percent = val;  	(*ppos)++; @@ -9225,10 +9270,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)  	}  	tr->allocated_snapshot = allocate_snapshot; -	/* -	 * Only the top level trace array gets its snapshot allocated -	 * from the kernel command line. -	 */  	allocate_snapshot = false;  #endif @@ -10144,6 +10185,79 @@ out:  	return ret;  } +#ifdef CONFIG_TRACER_MAX_TRACE +__init static bool tr_needs_alloc_snapshot(const char *name) +{ +	char *test; +	int len = strlen(name); +	bool ret; + +	if (!boot_snapshot_index) +		return false; + +	if (strncmp(name, boot_snapshot_info, len) == 0 && +	    boot_snapshot_info[len] == '\t') +		return true; + +	test = kmalloc(strlen(name) + 3, GFP_KERNEL); +	if (!test) +		return false; + +	sprintf(test, "\t%s\t", name); +	ret = strstr(boot_snapshot_info, test) == NULL; +	kfree(test); +	return ret; +} + +__init static void do_allocate_snapshot(const char *name) +{ +	if (!tr_needs_alloc_snapshot(name)) +		return; + +	/* +	 * When allocate_snapshot is set, the next call to +	 * allocate_trace_buffers() (called by trace_array_get_by_name()) +	 * will allocate the snapshot buffer. That will alse clear +	 * this flag. 
+	 */ +	allocate_snapshot = true; +} +#else +static inline void do_allocate_snapshot(const char *name) { } +#endif + +__init static void enable_instances(void) +{ +	struct trace_array *tr; +	char *curr_str; +	char *str; +	char *tok; + +	/* A tab is always appended */ +	boot_instance_info[boot_instance_index - 1] = '\0'; +	str = boot_instance_info; + +	while ((curr_str = strsep(&str, "\t"))) { + +		tok = strsep(&curr_str, ","); + +		if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE)) +			do_allocate_snapshot(tok); + +		tr = trace_array_get_by_name(tok); +		if (!tr) { +			pr_warn("Failed to create instance buffer %s\n", curr_str); +			continue; +		} +		/* Allow user space to delete it */ +		trace_array_put(tr); + +		while ((tok = strsep(&curr_str, ","))) { +			early_enable_events(tr, tok, true); +		} +	} +} +  __init static int tracer_alloc_buffers(void)  {  	int ring_buf_size; @@ -10277,10 +10391,19 @@ out:  void __init ftrace_boot_snapshot(void)  { +	struct trace_array *tr; +  	if (snapshot_at_boot) {  		tracing_snapshot();  		internal_trace_puts("** Boot snapshot taken **\n");  	} + +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr == &global_trace) +			continue; +		trace_array_puts(tr, "** Boot snapshot taken **\n"); +		tracing_snapshot_instance(tr); +	}  }  void __init early_trace_init(void) @@ -10295,11 +10418,16 @@ void __init early_trace_init(void)  			static_key_enable(&tracepoint_printk_key.key);  	}  	tracer_alloc_buffers(); + +	init_events();  }  void __init trace_init(void)  {  	trace_event_init(); + +	if (boot_instance_index) +		enable_instances();  }  __init static void clear_boot_tracer(void) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e46a49269be2..616e1aa1c4da 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -25,7 +25,7 @@  #include "pid_list.h"  #ifdef CONFIG_FTRACE_SYSCALLS -#include <asm/unistd.h>		/* For NR_SYSCALLS	     */ +#include <asm/unistd.h>		/* For NR_syscalls	     */  #include <asm/syscall.h>	/* some archs define it here */  #endif @@ -113,6 +113,10 @@ enum trace_type {  #define MEM_FAIL(condition, fmt, ...)					
\  	DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define HIST_STACKTRACE_DEPTH	16 +#define HIST_STACKTRACE_SIZE	(HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) +#define HIST_STACKTRACE_SKIP	5 +  /*   * syscalls are special, and need special handling, this is why   * they are not included in trace_entries.h @@ -1282,6 +1286,7 @@ struct ftrace_event_field {  	int			offset;  	int			size;  	int			is_signed; +	int			len;  };  struct prog_entry; @@ -1330,6 +1335,8 @@ DECLARE_PER_CPU(int, trace_buffered_event_cnt);  void trace_buffered_event_disable(void);  void trace_buffered_event_enable(void); +void early_enable_events(struct trace_array *tr, char *buf, bool disable_first); +  static inline void  __trace_event_discard_commit(struct trace_buffer *buffer,  			     struct ring_buffer_event *event) @@ -1490,6 +1497,7 @@ extern void trace_event_enable_cmd_record(bool enable);  extern void trace_event_enable_tgid_record(bool enable);  extern int event_trace_init(void); +extern int init_events(void);  extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);  extern int event_trace_del_tracer(struct trace_array *tr);  extern void __trace_early_add_events(struct trace_array *tr); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 352b65e2b910..67e854979d53 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -311,7 +311,7 @@ print_eprobe_event(struct trace_iterator *iter, int flags,  	trace_seq_putc(s, ')'); -	if (print_probe_args(s, tp->args, tp->nr_args, +	if (trace_probe_print_args(s, tp->args, tp->nr_args,  			     (u8 *)&field[1], field) < 0)  		goto out; @@ -320,7 +320,8 @@ print_eprobe_event(struct trace_iterator *iter, int flags,  	return trace_handle_return(s);  } -static unsigned long get_event_field(struct fetch_insn *code, void *rec) +static nokprobe_inline unsigned long +get_event_field(struct fetch_insn *code, void *rec)  {  	struct ftrace_event_field *field = code->data;  	unsigned long val; @@ -395,20 +396,12 @@ static int get_eprobe_size(struct trace_probe *tp, void *rec)  			case FETCH_OP_TP_ARG:  				val = get_event_field(code, rec);  				break; -			case FETCH_OP_IMM: -				val = code->immediate; -				break; -			case FETCH_OP_COMM: -				val = (unsigned long)current->comm; -				break; -			case FETCH_OP_DATA: -				val = (unsigned long)code->data; -				break;  			case FETCH_NOP_SYMBOL:	/* Ignore a place holder */  				code++;  				goto retry;  			default: -				continue; +				if (process_common_fetch_insn(code, &val) < 0) +					continue;  			}  			code++;  			len = process_fetch_insn_bottom(code, val, NULL, NULL); @@ -428,84 +421,26 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,  		   void *base)  {  	unsigned long val; +	int ret;   retry:  	switch (code->op) {  	case FETCH_OP_TP_ARG:  		val = get_event_field(code, rec);  		break; -	case FETCH_OP_IMM: -		val = code->immediate; -		break; -	case FETCH_OP_COMM: -		val = (unsigned long)current->comm; -		break; -	case FETCH_OP_DATA: -		val = (unsigned long)code->data; -		break;  	case FETCH_NOP_SYMBOL:	/* Ignore a place holder */  		code++;  		goto retry;  	default: -		return -EILSEQ; +		ret = process_common_fetch_insn(code, &val); +		if (ret < 0) +			return ret;  	}  	code++;  	return process_fetch_insn_bottom(code, val, dest, base);  }  NOKPROBE_SYMBOL(process_fetch_insn) -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long addr) -{ -	return 
kern_fetch_store_strlen_user(addr); -} - -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen(unsigned long addr) -{ -	return kern_fetch_store_strlen(addr); -} - -/* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. - */ -static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) -{ -	return kern_fetch_store_string_user(addr, dest, base); -} - -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. - */ -static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) -{ -	return kern_fetch_store_string(addr, dest, base); -} - -static nokprobe_inline int -probe_mem_read_user(void *dest, void *src, size_t size) -{ -	const void __user *uaddr =  (__force const void __user *)src; - -	return copy_from_user_nofault(dest, uaddr, size); -} - -static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) -{ -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE -	if ((unsigned long)src < TASK_SIZE) -		return probe_mem_read_user(dest, src, size); -#endif -	return copy_from_kernel_nofault(dest, src, size); -} -  /* eprobe handler */  static inline void  __eprobe_trace_func(struct eprobe_data *edata, void *rec) @@ -923,17 +858,13 @@ static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const ch  	p = ep->filter_str;  	for (i = 0; i < argc; i++) { -		ret = snprintf(p, len, "%s ", argv[i]); -		if (ret < 0) -			goto error; -		if (ret > len) { -			ret = -E2BIG; -			goto error; -		} +		if (i) +			ret = snprintf(p, len, " %s", argv[i]); +		else +			ret = snprintf(p, len, "%s", argv[i]);  		p += ret;  		len -= ret;  	} -	p[-1] = '\0';  	/*  	 * Ensure the filter string can be parsed correctly. 
Note, this diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 33e0b4f8ebe6..654ffa40457a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -114,7 +114,7 @@ trace_find_event_field(struct trace_event_call *call, char *name)  static int __trace_define_field(struct list_head *head, const char *type,  				const char *name, int offset, int size, -				int is_signed, int filter_type) +				int is_signed, int filter_type, int len)  {  	struct ftrace_event_field *field; @@ -133,6 +133,7 @@ static int __trace_define_field(struct list_head *head, const char *type,  	field->offset = offset;  	field->size = size;  	field->is_signed = is_signed; +	field->len = len;  	list_add(&field->link, head); @@ -150,14 +151,28 @@ int trace_define_field(struct trace_event_call *call, const char *type,  	head = trace_get_fields(call);  	return __trace_define_field(head, type, name, offset, size, -				    is_signed, filter_type); +				    is_signed, filter_type, 0);  }  EXPORT_SYMBOL_GPL(trace_define_field); +static int trace_define_field_ext(struct trace_event_call *call, const char *type, +		       const char *name, int offset, int size, int is_signed, +		       int filter_type, int len) +{ +	struct list_head *head; + +	if (WARN_ON(!call->class)) +		return 0; + +	head = trace_get_fields(call); +	return __trace_define_field(head, type, name, offset, size, +				    is_signed, filter_type, len); +} +  #define __generic_field(type, item, filter_type)			\  	ret = __trace_define_field(&ftrace_generic_fields, #type,	\  				   #item, 0, 0, is_signed_type(type),	\ -				   filter_type);			\ +				   filter_type, 0);			\  	if (ret)							\  		return ret; @@ -166,7 +181,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);  				   "common_" #item,			\  				   offsetof(typeof(ent), item),		\  				   sizeof(ent.item),			\ -				   is_signed_type(type), FILTER_OTHER);	\ +				   is_signed_type(type), FILTER_OTHER, 0);	\  	if (ret)							\  		return ret; @@ -1588,12 +1603,17 @@ static int f_show(struct seq_file *m, void *v)  		seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n",  			   field->type, field->name, field->offset,  			   field->size, !!field->is_signed); -	else -		seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", +	else if (field->len) +		seq_printf(m, "\tfield:%.*s %s[%d];\toffset:%u;\tsize:%u;\tsigned:%d;\n",  			   (int)(array_descriptor - field->type),  			   field->type, field->name, -			   array_descriptor, field->offset, +			   field->len, field->offset,  			   field->size, !!field->is_signed); +	else +		seq_printf(m, "\tfield:%.*s %s[];\toffset:%u;\tsize:%u;\tsigned:%d;\n", +				(int)(array_descriptor - field->type), +				field->type, field->name, +				field->offset, field->size, !!field->is_signed);  	return 0;  } @@ -2261,8 +2281,6 @@ create_new_subsystem(const char *name)  	if (!system->name)  		goto out_free; -	system->filter = NULL; -  	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);  	if (!system->filter)  		goto out_free; @@ -2379,9 +2397,10 @@ event_define_fields(struct trace_event_call *call)  			}  			offset = ALIGN(offset, field->align); -			ret = trace_define_field(call, field->type, field->name, +			ret = trace_define_field_ext(call, field->type, field->name,  						 offset, field->size, -						 field->is_signed, field->filter_type); +						 field->is_signed, field->filter_type, +						 field->len);  			if (WARN_ON_ONCE(ret)) {  				pr_err("error code is %d\n", ret);  				break; @@ -2822,7 +2841,7 @@ 
static __init int setup_trace_triggers(char *str)  		if (!trigger)  			break;  		bootup_triggers[i].event = strsep(&trigger, "."); -		bootup_triggers[i].trigger = strsep(&trigger, "."); +		bootup_triggers[i].trigger = trigger;  		if (!bootup_triggers[i].trigger)  			break;  	} @@ -3750,10 +3769,9 @@ static __init int event_trace_memsetup(void)  	return 0;  } -static __init void -early_enable_events(struct trace_array *tr, bool disable_first) +__init void +early_enable_events(struct trace_array *tr, char *buf, bool disable_first)  { -	char *buf = bootup_event_buf;  	char *token;  	int ret; @@ -3806,7 +3824,7 @@ static __init int event_trace_enable(void)  	 */  	__trace_early_add_events(tr); -	early_enable_events(tr, false); +	early_enable_events(tr, bootup_event_buf, false);  	trace_printk_start_comm(); @@ -3834,7 +3852,7 @@ static __init int event_trace_enable_again(void)  	if (!tr)  		return -ENODEV; -	early_enable_events(tr, true); +	early_enable_events(tr, bootup_event_buf, true);  	return 0;  } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 96acc2b71ac7..1dad64267878 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -64,6 +64,7 @@ enum filter_pred_fn {  	FILTER_PRED_FN_PCHAR_USER,  	FILTER_PRED_FN_PCHAR,  	FILTER_PRED_FN_CPU, +	FILTER_PRED_FN_FUNCTION,  	FILTER_PRED_FN_,  	FILTER_PRED_TEST_VISITED,  }; @@ -71,6 +72,7 @@ enum filter_pred_fn {  struct filter_pred {  	enum filter_pred_fn 	fn_num;  	u64 			val; +	u64 			val2;  	struct regex		regex;  	unsigned short		*ops;  	struct ftrace_event_field *field; @@ -103,6 +105,7 @@ struct filter_pred {  	C(INVALID_FILTER,	"Meaningless filter expression"),	\  	C(IP_FIELD_ONLY,	"Only 'ip' field is supported for function trace"), \  	C(INVALID_VALUE,	"Invalid value (did you forget quotes)?"), \ +	C(NO_FUNCTION,		"Function not found"),			\  	C(ERRNO,		"Error"),				\  	C(NO_FILTER,		"No filter found") @@ -128,7 +131,7 @@ static bool is_not(const char *str)  }  /** - * prog_entry - a singe entry in the filter program + * struct prog_entry - a singe entry in the filter program   * @target:	     Index to jump to on a branch (actually one minus the index)   * @when_to_branch:  The value of the result of the predicate to do a branch   * @pred:	     The predicate to execute. @@ -140,16 +143,16 @@ struct prog_entry {  };  /** - * update_preds- assign a program entry a label target + * update_preds - assign a program entry a label target   * @prog: The program array   * @N: The index of the current entry in @prog - * @when_to_branch: What to assign a program entry for its branch condition + * @invert: What to assign a program entry for its branch condition   *   * The program entry at @N has a target that points to the index of a program   * entry that can have its target and when_to_branch fields updated.   * Update the current program entry denoted by index @N target field to be   * that of the updated entry. This will denote the entry to update if - * we are processing an "||" after an "&&" + * we are processing an "||" after an "&&".   */  static void update_preds(struct prog_entry *prog, int N, int invert)  { @@ -876,6 +879,17 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)  	return cmp ^ pred->not;  } +/* Filter predicate for functions. 
*/ +static int filter_pred_function(struct filter_pred *pred, void *event) +{ +	unsigned long *addr = (unsigned long *)(event + pred->offset); +	unsigned long start = (unsigned long)pred->val; +	unsigned long end = (unsigned long)pred->val2; +	int ret = *addr >= start && *addr < end; + +	return pred->op == OP_EQ ? ret : !ret; +} +  /*   * regex_match_foo - Basic regex callbacks   * @@ -1335,6 +1349,8 @@ static int filter_pred_fn_call(struct filter_pred *pred, void *event)  		return filter_pred_pchar(pred, event);  	case FILTER_PRED_FN_CPU:  		return filter_pred_cpu(pred, event); +	case FILTER_PRED_FN_FUNCTION: +		return filter_pred_function(pred, event);  	case FILTER_PRED_TEST_VISITED:  		return test_pred_visited_fn(pred, event);  	default: @@ -1350,8 +1366,13 @@ static int parse_pred(const char *str, void *data,  	struct trace_event_call *call = data;  	struct ftrace_event_field *field;  	struct filter_pred *pred = NULL; +	unsigned long offset; +	unsigned long size; +	unsigned long ip;  	char num_buf[24];	/* Big enough to hold an address */  	char *field_name; +	char *name; +	bool function = false;  	bool ustring = false;  	char q;  	u64 val; @@ -1393,6 +1414,12 @@ static int parse_pred(const char *str, void *data,  		i += len;  	} +	/* See if the field is a kernel function name */ +	if ((len = str_has_prefix(str + i, ".function"))) { +		function = true; +		i += len; +	} +  	while (isspace(str[i]))  		i++; @@ -1423,7 +1450,71 @@ static int parse_pred(const char *str, void *data,  	pred->offset = field->offset;  	pred->op = op; -	if (ftrace_event_is_function(call)) { +	if (function) { +		/* The field must be the same size as long */ +		if (field->size != sizeof(long)) { +			parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i); +			goto err_free; +		} + +		/* Function only works with '==' or '!=' and an unquoted string */ +		switch (op) { +		case OP_NE: +		case OP_EQ: +			break; +		default: +			parse_error(pe, FILT_ERR_INVALID_OP, pos + i); +			goto err_free; +		} + +		if (isdigit(str[i])) { +			/* We allow 0xDEADBEEF */ +			while (isalnum(str[i])) +				i++; + +			len = i - s; +			/* 0xfeedfacedeadbeef is 18 chars max */ +			if (len >= sizeof(num_buf)) { +				parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); +				goto err_free; +			} + +			strncpy(num_buf, str + s, len); +			num_buf[len] = 0; + +			ret = kstrtoul(num_buf, 0, &ip); +			if (ret) { +				parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i); +				goto err_free; +			} +		} else { +			s = i; +			for (; str[i] && !isspace(str[i]); i++) +				; + +			len = i - s; +			name = kmemdup_nul(str + s, len, GFP_KERNEL); +			if (!name) +				goto err_mem; +			ip = kallsyms_lookup_name(name); +			kfree(name); +			if (!ip) { +				parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); +				goto err_free; +			} +		} + +		/* Now find the function start and end address */ +		if (!kallsyms_lookup_size_offset(ip, &size, &offset)) { +			parse_error(pe, FILT_ERR_NO_FUNCTION, pos + i); +			goto err_free; +		} + +		pred->fn_num = FILTER_PRED_FN_FUNCTION; +		pred->val = ip - offset; +		pred->val2 = pred->val + size; + +	} else if (ftrace_event_is_function(call)) {  		/*  		 * Perf does things different with function events.  		 * It only allows an "ip" field, and expects a string. 
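
The parse_pred() additions above resolve the right-hand side of a ".function" comparison to a function's [start, start + size) range via kallsyms, and filter_pred_function() then reduces the match to a range check on the recorded pointer. The resolution step, pulled out as a stand-alone helper for illustration (the helper name is an assumption, not from the patch):

	#include <linux/kallsyms.h>
	#include <linux/types.h>

	/* Does @addr fall inside the function containing @func_ip? */
	static bool addr_within_function(unsigned long addr, unsigned long func_ip)
	{
		unsigned long size, offset;

		if (!kallsyms_lookup_size_offset(func_ip, &size, &offset))
			return false;

		/* Normalize func_ip to the function start, as parse_pred() does. */
		return addr >= func_ip - offset && addr < func_ip - offset + size;
	}

Since the parser only accepts '==' or '!=' on a pointer-sized field, the resulting tracefs usage should look something like <field>.function == <symbol or hex address>, for instance against the call_site field of the kmem events.
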
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fcaf226b7744..89877a18f933 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -135,6 +135,7 @@ enum hist_field_fn {  	HIST_FIELD_FN_DIV_NOT_POWER2,  	HIST_FIELD_FN_DIV_MULT_SHIFT,  	HIST_FIELD_FN_EXECNAME, +	HIST_FIELD_FN_STACK,  };  /* @@ -480,10 +481,6 @@ DEFINE_HIST_FIELD_FN(u8);  #define for_each_hist_key_field(i, hist_data)	\  	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++) -#define HIST_STACKTRACE_DEPTH	16 -#define HIST_STACKTRACE_SIZE	(HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) -#define HIST_STACKTRACE_SKIP	5 -  #define HITCOUNT_IDX		0  #define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) @@ -1360,7 +1357,12 @@ static const char *hist_field_name(struct hist_field *field,  			field_name = field->name;  	} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)  		field_name = "common_timestamp"; -	else if (field->flags & HIST_FIELD_FL_HITCOUNT) +	else if (field->flags & HIST_FIELD_FL_STACKTRACE) { +		if (field->field) +			field_name = field->field->name; +		else +			field_name = "stacktrace"; +	} else if (field->flags & HIST_FIELD_FL_HITCOUNT)  		field_name = "hitcount";  	if (field_name == NULL) @@ -1718,6 +1720,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)  		flags_str = "percent";  	else if (hist_field->flags & HIST_FIELD_FL_GRAPH)  		flags_str = "graph"; +	else if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) +		flags_str = "stacktrace";  	return flags_str;  } @@ -1979,7 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,  	}  	if (flags & HIST_FIELD_FL_STACKTRACE) { -		hist_field->fn_num = HIST_FIELD_FN_NOP; +		if (field) +			hist_field->fn_num = HIST_FIELD_FN_STACK; +		else +			hist_field->fn_num = HIST_FIELD_FN_NOP; +		hist_field->size = HIST_STACKTRACE_SIZE; +		hist_field->type = kstrdup_const("unsigned long[]", GFP_KERNEL); +		if (!hist_field->type) +			goto free;  		goto out;  	} @@ -1988,6 +1999,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,  		hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? 
HIST_FIELD_FN_LOG2 :  			HIST_FIELD_FN_BUCKET;  		hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); +		if (!hist_field->operands[0]) +			goto free;  		hist_field->size = hist_field->operands[0]->size;  		hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);  		if (!hist_field->type) @@ -2310,6 +2323,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,  			*flags |= HIST_FIELD_FL_EXECNAME;  		else if (strcmp(modifier, "syscall") == 0)  			*flags |= HIST_FIELD_FL_SYSCALL; +		else if (strcmp(modifier, "stacktrace") == 0) +			*flags |= HIST_FIELD_FL_STACKTRACE;  		else if (strcmp(modifier, "log2") == 0)  			*flags |= HIST_FIELD_FL_LOG2;  		else if (strcmp(modifier, "usecs") == 0) @@ -2349,6 +2364,8 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,  		hist_data->enable_timestamps = true;  		if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)  			hist_data->attrs->ts_in_usecs = true; +	} else if (strcmp(field_name, "stacktrace") == 0) { +		*flags |= HIST_FIELD_FL_STACKTRACE;  	} else if (strcmp(field_name, "common_cpu") == 0)  		*flags |= HIST_FIELD_FL_CPU;  	else if (strcmp(field_name, "hitcount") == 0) @@ -3109,6 +3126,9 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,  	unsigned int i, j, var_idx;  	u64 var_val; +	/* Make sure stacktrace can fit in the string variable length */ +	BUILD_BUG_ON((HIST_STACKTRACE_DEPTH + 1) * sizeof(long) >= STR_VAR_LEN_MAX); +  	for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {  		struct field_var *field_var = field_vars[i];  		struct hist_field *var = field_var->var; @@ -3117,13 +3137,26 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,  		var_val = hist_fn_call(val, elt, buffer, rbe, rec);  		var_idx = var->var.idx; -		if (val->flags & HIST_FIELD_FL_STRING) { +		if (val->flags & (HIST_FIELD_FL_STRING | +				  HIST_FIELD_FL_STACKTRACE)) {  			char *str = elt_data->field_var_str[j++];  			char *val_str = (char *)(uintptr_t)var_val;  			unsigned int size; -			size = min(val->size, STR_VAR_LEN_MAX); -			strscpy(str, val_str, size); +			if (val->flags & HIST_FIELD_FL_STRING) { +				size = min(val->size, STR_VAR_LEN_MAX); +				strscpy(str, val_str, size); +			} else { +				char *stack_start = str + sizeof(unsigned long); +				int e; + +				e = stack_trace_save((void *)stack_start, +						     HIST_STACKTRACE_DEPTH, +						     HIST_STACKTRACE_SKIP); +				if (e < HIST_STACKTRACE_DEPTH - 1) +					((unsigned long *)stack_start)[e] = 0; +				*((unsigned long *)str) = e; +			}  			var_val = (u64)(uintptr_t)str;  		}  		tracing_map_set_var(elt, var_idx, var_val); @@ -3822,7 +3855,8 @@ static void save_field_var(struct hist_trigger_data *hist_data,  {  	hist_data->field_vars[hist_data->n_field_vars++] = field_var; -	if (field_var->val->flags & HIST_FIELD_FL_STRING) +	/* Stack traces are saved in the string storage too */ +	if (field_var->val->flags & (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))  		hist_data->n_field_var_str++;  } @@ -3847,6 +3881,9 @@ static int check_synth_field(struct synth_event *event,  	    && field->is_dynamic)  		return 0; +	if (strstr(hist_field->type, "long[") && field->is_stack) +		return 0; +  	if (strcmp(field->type, hist_field->type) != 0) {  		if (field->size != hist_field->size ||  		    (!field->is_string && field->is_signed != hist_field->is_signed)) @@ -4101,7 +4138,8 @@ static int action_create(struct hist_trigger_data *hist_data,  			}  			
hist_data->save_vars[hist_data->n_save_vars++] = field_var; -			if (field_var->val->flags & HIST_FIELD_FL_STRING) +			if (field_var->val->flags & +			    (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))  				hist_data->n_save_var_str++;  			kfree(param);  		} @@ -4240,6 +4278,19 @@ static u64 hist_field_execname(struct hist_field *hist_field,  	return (u64)(unsigned long)(elt_data->comm);  } +static u64 hist_field_stack(struct hist_field *hist_field, +			    struct tracing_map_elt *elt, +			    struct trace_buffer *buffer, +			    struct ring_buffer_event *rbe, +			    void *event) +{ +	u32 str_item = *(u32 *)(event + hist_field->field->offset); +	int str_loc = str_item & 0xffff; +	char *addr = (char *)(event + str_loc); + +	return (u64)(unsigned long)addr; +} +  static u64 hist_fn_call(struct hist_field *hist_field,  			struct tracing_map_elt *elt,  			struct trace_buffer *buffer, @@ -4303,6 +4354,8 @@ static u64 hist_fn_call(struct hist_field *hist_field,  		return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);  	case HIST_FIELD_FN_EXECNAME:  		return hist_field_execname(hist_field, elt, buffer, rbe, event); +	case HIST_FIELD_FN_STACK: +		return hist_field_stack(hist_field, elt, buffer, rbe, event);  	default:  		return 0;  	} @@ -4349,7 +4402,8 @@ static int create_var_field(struct hist_trigger_data *hist_data,  	if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_EXECNAME)  		update_var_execname(hist_data->fields[val_idx]); -	if (!ret && hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING) +	if (!ret && hist_data->fields[val_idx]->flags & +	    (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE))  		hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;  	return ret; @@ -5090,7 +5144,8 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,  		if (hist_field->flags & HIST_FIELD_FL_VAR) {  			var_idx = hist_field->var.idx; -			if (hist_field->flags & HIST_FIELD_FL_STRING) { +			if (hist_field->flags & +			    (HIST_FIELD_FL_STRING | HIST_FIELD_FL_STACKTRACE)) {  				unsigned int str_start, var_str_idx, idx;  				char *str, *val_str;  				unsigned int size; @@ -5103,9 +5158,20 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,  				str = elt_data->field_var_str[idx];  				val_str = (char *)(uintptr_t)hist_val; -				size = min(hist_field->size, STR_VAR_LEN_MAX); -				strscpy(str, val_str, size); - +				if (hist_field->flags & HIST_FIELD_FL_STRING) { +					size = min(hist_field->size, STR_VAR_LEN_MAX); +					strscpy(str, val_str, size); +				} else { +					char *stack_start = str + sizeof(unsigned long); +					int e; + +					e = stack_trace_save((void *)stack_start, +							     HIST_STACKTRACE_DEPTH, +							     HIST_STACKTRACE_SKIP); +					if (e < HIST_STACKTRACE_DEPTH - 1) +						((unsigned long *)stack_start)[e] = 0; +					*((unsigned long *)str) = e; +				}  				hist_val = (u64)(uintptr_t)str;  			}  			tracing_map_set_var(elt, var_idx, hist_val); @@ -5191,8 +5257,17 @@ static void event_hist_trigger(struct event_trigger_data *data,  		if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {  			memset(entries, 0, HIST_STACKTRACE_SIZE); -			stack_trace_save(entries, HIST_STACKTRACE_DEPTH, -					 HIST_STACKTRACE_SKIP); +			if (key_field->field) { +				unsigned long *stack, n_entries; + +				field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); +				stack = (unsigned long *)(long)field_contents; +				n_entries = *stack; +				memcpy(entries, ++stack, n_entries * sizeof(unsigned long)); +			} else { 
+				stack_trace_save(entries, HIST_STACKTRACE_DEPTH, +						 HIST_STACKTRACE_SKIP); +			}  			key = entries;  		} else {  			field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); @@ -5295,7 +5370,10 @@ static void hist_trigger_print_key(struct seq_file *m,  			seq_printf(m, "%s: %-30s[%3llu]", field_name,  				   syscall_name, uval);  		} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { -			seq_puts(m, "stacktrace:\n"); +			if (key_field->field) +				seq_printf(m, "%s.stacktrace", key_field->field->name); +			else +				seq_puts(m, "stacktrace:\n");  			hist_trigger_stacktrace_print(m,  						      key + key_field->offset,  						      HIST_STACKTRACE_DEPTH); @@ -5840,7 +5918,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)  	if (hist_field->flags) {  		if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && -		    !(hist_field->flags & HIST_FIELD_FL_EXPR)) { +		    !(hist_field->flags & HIST_FIELD_FL_EXPR) && +		    !(hist_field->flags & HIST_FIELD_FL_STACKTRACE)) {  			const char *flags = get_hist_field_flags(hist_field);  			if (flags) @@ -5873,9 +5952,12 @@ static int event_hist_trigger_print(struct seq_file *m,  		if (i > hist_data->n_vals)  			seq_puts(m, ","); -		if (field->flags & HIST_FIELD_FL_STACKTRACE) -			seq_puts(m, "stacktrace"); -		else +		if (field->flags & HIST_FIELD_FL_STACKTRACE) { +			if (field->field) +				seq_printf(m, "%s.stacktrace", field->field->name); +			else +				seq_puts(m, "stacktrace"); +		} else  			hist_field_print(m, field);  	} diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 67592eed0be8..46d0abb32d0f 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -173,6 +173,14 @@ static int synth_field_is_string(char *type)  	return false;  } +static int synth_field_is_stack(char *type) +{ +	if (strstr(type, "long[") != NULL) +		return true; + +	return false; +} +  static int synth_field_string_size(char *type)  {  	char buf[4], *end, *start; @@ -248,6 +256,8 @@ static int synth_field_size(char *type)  		size = sizeof(gfp_t);  	else if (synth_field_is_string(type))  		size = synth_field_string_size(type); +	else if (synth_field_is_stack(type)) +		size = 0;  	return size;  } @@ -292,6 +302,8 @@ static const char *synth_field_fmt(char *type)  		fmt = "%x";  	else if (synth_field_is_string(type))  		fmt = "%.*s"; +	else if (synth_field_is_stack(type)) +		fmt = "%s";  	return fmt;  } @@ -371,6 +383,23 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,  						 i == se->n_fields - 1 ? 
"" : " ");  				n_u64 += STR_VAR_LEN_MAX / sizeof(u64);  			} +		} else if (se->fields[i]->is_stack) { +			u32 offset, data_offset, len; +			unsigned long *p, *end; + +			offset = (u32)entry->fields[n_u64]; +			data_offset = offset & 0xffff; +			len = offset >> 16; + +			p = (void *)entry + data_offset; +			end = (void *)p + len - (sizeof(long) - 1); + +			trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); + +			for (; *p && p < end; p++) +				trace_seq_printf(s, "=> %pS\n", (void *)*p); +			n_u64++; +  		} else {  			struct trace_print_flags __flags[] = {  			    __def_gfpflag_names, {-1, NULL} }; @@ -416,16 +445,15 @@ static unsigned int trace_string(struct synth_trace_event *entry,  	if (is_dynamic) {  		u32 data_offset; -		data_offset = offsetof(typeof(*entry), fields); -		data_offset += event->n_u64 * sizeof(u64); +		data_offset = struct_size(entry, fields, event->n_u64);  		data_offset += data_size; -		len = kern_fetch_store_strlen((unsigned long)str_val); +		len = fetch_store_strlen((unsigned long)str_val);  		data_offset |= len << 16;  		*(u32 *)&entry->fields[*n_u64] = data_offset; -		ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); +		ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry);  		(*n_u64)++;  	} else { @@ -447,6 +475,43 @@ static unsigned int trace_string(struct synth_trace_event *entry,  	return len;  } +static unsigned int trace_stack(struct synth_trace_event *entry, +				 struct synth_event *event, +				 long *stack, +				 unsigned int data_size, +				 unsigned int *n_u64) +{ +	unsigned int len; +	u32 data_offset; +	void *data_loc; + +	data_offset = struct_size(entry, fields, event->n_u64); +	data_offset += data_size; + +	for (len = 0; len < HIST_STACKTRACE_DEPTH; len++) { +		if (!stack[len]) +			break; +	} + +	/* Include the zero'd element if it fits */ +	if (len < HIST_STACKTRACE_DEPTH) +		len++; + +	len *= sizeof(long); + +	/* Find the dynamic section to copy the stack into. 
*/ +	data_loc = (void *)entry + data_offset; +	memcpy(data_loc, stack, len); + +	/* Fill in the field that holds the offset/len combo */ +	data_offset |= len << 16; +	*(u32 *)&entry->fields[*n_u64] = data_offset; + +	(*n_u64)++; + +	return len; +} +  static notrace void trace_event_raw_event_synth(void *__data,  						u64 *var_ref_vals,  						unsigned int *var_ref_idx) @@ -473,7 +538,12 @@ static notrace void trace_event_raw_event_synth(void *__data,  		val_idx = var_ref_idx[field_pos];  		str_val = (char *)(long)var_ref_vals[val_idx]; -		len = kern_fetch_store_strlen((unsigned long)str_val); +		if (event->dynamic_fields[i]->is_stack) { +			len = *((unsigned long *)str_val); +			len *= sizeof(unsigned long); +		} else { +			len = fetch_store_strlen((unsigned long)str_val); +		}  		fields_size += len;  	} @@ -499,6 +569,12 @@ static notrace void trace_event_raw_event_synth(void *__data,  					   event->fields[i]->is_dynamic,  					   data_size, &n_u64);  			data_size += len; /* only dynamic string increments */ +		} else if (event->fields[i]->is_stack) { +			long *stack = (long *)(long)var_ref_vals[val_idx]; + +			len = trace_stack(entry, event, stack, +					   data_size, &n_u64); +			data_size += len;  		} else {  			struct synth_field *field = event->fields[i];  			u64 val = var_ref_vals[val_idx]; @@ -561,6 +637,9 @@ static int __set_synth_event_print_fmt(struct synth_event *event,  		    event->fields[i]->is_dynamic)  			pos += snprintf(buf + pos, LEN_OR_ZERO,  				", __get_str(%s)", event->fields[i]->name); +		else if (event->fields[i]->is_stack) +			pos += snprintf(buf + pos, LEN_OR_ZERO, +				", __get_stacktrace(%s)", event->fields[i]->name);  		else  			pos += snprintf(buf + pos, LEN_OR_ZERO,  					", REC->%s", event->fields[i]->name); @@ -697,7 +776,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,  		ret = -EINVAL;  		goto free;  	} else if (size == 0) { -		if (synth_field_is_string(field->type)) { +		if (synth_field_is_string(field->type) || +		    synth_field_is_stack(field->type)) {  			char *type;  			len = sizeof("__data_loc ") + strlen(field->type) + 1; @@ -728,6 +808,8 @@ static struct synth_field *parse_synth_field(int argc, char **argv,  	if (synth_field_is_string(field->type))  		field->is_string = true; +	else if (synth_field_is_stack(field->type)) +		field->is_stack = true;  	field->is_signed = synth_field_signed(field->type);   out: diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d960f6b11b5e..58f3946081e2 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -111,7 +111,8 @@ static void __always_unused ____ftrace_check_##name(void)		\  #define __array(_type, _item, _len) {					\  	.type = #_type"["__stringify(_len)"]", .name = #_item,		\  	.size = sizeof(_type[_len]), .align = __alignof__(_type),	\ -	is_signed_type(_type), .filter_type = FILTER_OTHER }, +	is_signed_type(_type), .filter_type = FILTER_OTHER,			\ +	.len = _len },  #undef __array_desc  #define __array_desc(_type, _container, _item, _len) __array(_type, _item, _len) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ee77c8203bd5..59cda19a9033 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1218,60 +1218,6 @@ static const struct file_operations kprobe_profile_ops = {  	.release        = seq_release,  }; -/* Kprobe specific fetch functions */ - -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen_user(unsigned long 
addr) -{ -	return kern_fetch_store_strlen_user(addr); -} - -/* Return the length of string -- including null terminal byte */ -static nokprobe_inline int -fetch_store_strlen(unsigned long addr) -{ -	return kern_fetch_store_strlen(addr); -} - -/* - * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf - * with max length and relative data location. - */ -static nokprobe_inline int -fetch_store_string_user(unsigned long addr, void *dest, void *base) -{ -	return kern_fetch_store_string_user(addr, dest, base); -} - -/* - * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max - * length and relative data location. - */ -static nokprobe_inline int -fetch_store_string(unsigned long addr, void *dest, void *base) -{ -	return kern_fetch_store_string(addr, dest, base); -} - -static nokprobe_inline int -probe_mem_read_user(void *dest, void *src, size_t size) -{ -	const void __user *uaddr =  (__force const void __user *)src; - -	return copy_from_user_nofault(dest, uaddr, size); -} - -static nokprobe_inline int -probe_mem_read(void *dest, void *src, size_t size) -{ -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE -	if ((unsigned long)src < TASK_SIZE) -		return probe_mem_read_user(dest, src, size); -#endif -	return copy_from_kernel_nofault(dest, src, size); -} -  /* Note that we don't verify it, since the code does not come from user space */  static int  process_fetch_insn(struct fetch_insn *code, void *rec, void *dest, @@ -1279,6 +1225,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,  {  	struct pt_regs *regs = rec;  	unsigned long val; +	int ret;  retry:  	/* 1st stage: get value from context */ @@ -1295,15 +1242,6 @@ retry:  	case FETCH_OP_RETVAL:  		val = regs_return_value(regs);  		break; -	case FETCH_OP_IMM: -		val = code->immediate; -		break; -	case FETCH_OP_COMM: -		val = (unsigned long)current->comm; -		break; -	case FETCH_OP_DATA: -		val = (unsigned long)code->data; -		break;  #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API  	case FETCH_OP_ARG:  		val = regs_get_kernel_argument(regs, code->param); @@ -1313,7 +1251,9 @@ retry:  		code++;  		goto retry;  	default: -		return -EILSEQ; +		ret = process_common_fetch_insn(code, &val); +		if (ret < 0) +			return ret;  	}  	code++; @@ -1424,7 +1364,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags,  	trace_seq_putc(s, ')'); -	if (print_probe_args(s, tp->args, tp->nr_args, +	if (trace_probe_print_args(s, tp->args, tp->nr_args,  			     (u8 *)&field[1], field) < 0)  		goto out; @@ -1459,7 +1399,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags,  	trace_seq_putc(s, ')'); -	if (print_probe_args(s, tp->args, tp->nr_args, +	if (trace_probe_print_args(s, tp->args, tp->nr_args,  			     (u8 *)&field[1], field) < 0)  		goto out; diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 94c1b5eb1dc0..04f0fdae19a1 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -147,9 +147,8 @@ static void osnoise_unregister_instance(struct trace_array *tr)  	 * register/unregister serialization is provided by trace's  	 * trace_types_lock.  	 
*/ -	lockdep_assert_held(&trace_types_lock); - -	list_for_each_entry_rcu(inst, &osnoise_instances, list) { +	list_for_each_entry_rcu(inst, &osnoise_instances, list, +				lockdep_is_held(&trace_types_lock)) {  		if (inst->tr == tr) {  			list_del_rcu(&inst->list);  			found = 1; @@ -1540,7 +1539,7 @@ static void osnoise_sleep(void)  	wake_time = ktime_add_us(ktime_get(), interval);  	__set_current_state(TASK_INTERRUPTIBLE); -	while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { +	while (schedule_hrtimeout(&wake_time, HRTIMER_MODE_ABS)) {  		if (kthread_should_stop())  			break;  	} diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57a13b61f186..bd475a00f96d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1535,7 +1535,7 @@ static struct trace_event *events[] __initdata = {  	NULL  }; -__init static int init_events(void) +__init int init_events(void)  {  	struct trace_event *event;  	int i, ret; @@ -1548,4 +1548,3 @@ __init static int init_events(void)  	return 0;  } -early_initcall(init_events); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 1e130da1b742..e37446f7916e 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -15,6 +15,20 @@  #define CREATE_TRACE_POINTS  #include <trace/events/preemptirq.h> +/* + * Use regular trace points on architectures that implement noinstr + * tooling: these calls will only happen with RCU enabled, which can + * use a regular tracepoint. + * + * On older architectures, use the rcuidle tracing methods (which + * aren't NMI-safe - so exclude NMI contexts): + */ +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#define trace(point)	trace_##point +#else +#define trace(point)	if (!in_nmi()) trace_##point##_rcuidle +#endif +  #ifdef CONFIG_TRACE_IRQFLAGS  /* Per-cpu variable to prevent redundant calls when IRQs already off */  static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -28,8 +42,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu);  void trace_hardirqs_on_prepare(void)  {  	if (this_cpu_read(tracing_irq_cpu)) { -		if (!in_nmi()) -			trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); +		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);  		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);  		this_cpu_write(tracing_irq_cpu, 0);  	} @@ -40,8 +53,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare);  void trace_hardirqs_on(void)  {  	if (this_cpu_read(tracing_irq_cpu)) { -		if (!in_nmi()) -			trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); +		trace(irq_enable)(CALLER_ADDR0, CALLER_ADDR1);  		tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);  		this_cpu_write(tracing_irq_cpu, 0);  	} @@ -63,8 +75,7 @@ void trace_hardirqs_off_finish(void)  	if (!this_cpu_read(tracing_irq_cpu)) {  		this_cpu_write(tracing_irq_cpu, 1);  		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); -		if (!in_nmi()) -			trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); +		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);  	}  } @@ -78,56 +89,24 @@ void trace_hardirqs_off(void)  	if (!this_cpu_read(tracing_irq_cpu)) {  		this_cpu_write(tracing_irq_cpu, 1);  		tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); -		if (!in_nmi()) -			trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); +		trace(irq_disable)(CALLER_ADDR0, CALLER_ADDR1);  	}  }  EXPORT_SYMBOL(trace_hardirqs_off);  NOKPROBE_SYMBOL(trace_hardirqs_off); - -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) -{ -	if (this_cpu_read(tracing_irq_cpu)) { -		if (!in_nmi()) -			trace_irq_enable_rcuidle(CALLER_ADDR0, 
caller_addr); -		tracer_hardirqs_on(CALLER_ADDR0, caller_addr); -		this_cpu_write(tracing_irq_cpu, 0); -	} - -	lockdep_hardirqs_on_prepare(); -	lockdep_hardirqs_on(caller_addr); -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); -NOKPROBE_SYMBOL(trace_hardirqs_on_caller); - -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) -{ -	lockdep_hardirqs_off(caller_addr); - -	if (!this_cpu_read(tracing_irq_cpu)) { -		this_cpu_write(tracing_irq_cpu, 1); -		tracer_hardirqs_off(CALLER_ADDR0, caller_addr); -		if (!in_nmi()) -			trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); -	} -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); -NOKPROBE_SYMBOL(trace_hardirqs_off_caller);  #endif /* CONFIG_TRACE_IRQFLAGS */  #ifdef CONFIG_TRACE_PREEMPT_TOGGLE  void trace_preempt_on(unsigned long a0, unsigned long a1)  { -	if (!in_nmi()) -		trace_preempt_enable_rcuidle(a0, a1); +	trace(preempt_enable)(a0, a1);  	tracer_preempt_on(a0, a1);  }  void trace_preempt_off(unsigned long a0, unsigned long a1)  { -	if (!in_nmi()) -		trace_preempt_disable_rcuidle(a0, a1); +	trace(preempt_disable)(a0, a1);  	tracer_preempt_off(a0, a1);  }  #endif diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 01ebabbbe8c9..20d0c4a97633 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -50,6 +50,7 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(x8,  u8,  "0x%x")  DEFINE_BASIC_PRINT_TYPE_FUNC(x16, u16, "0x%x")  DEFINE_BASIC_PRINT_TYPE_FUNC(x32, u32, "0x%x")  DEFINE_BASIC_PRINT_TYPE_FUNC(x64, u64, "0x%Lx") +DEFINE_BASIC_PRINT_TYPE_FUNC(char, u8, "'%c'")  int PRINT_TYPE_FUNC_NAME(symbol)(struct trace_seq *s, void *data, void *ent)  { @@ -95,6 +96,7 @@ static const struct fetch_type probe_fetch_types[] = {  	ASSIGN_FETCH_TYPE_ALIAS(x16, u16, u16, 0),  	ASSIGN_FETCH_TYPE_ALIAS(x32, u32, u32, 0),  	ASSIGN_FETCH_TYPE_ALIAS(x64, u64, u64, 0), +	ASSIGN_FETCH_TYPE_ALIAS(char, u8, u8,  0),  	ASSIGN_FETCH_TYPE_ALIAS(symbol, ADDR_FETCH_TYPE, ADDR_FETCH_TYPE, 0),  	ASSIGN_FETCH_TYPE_END @@ -1237,3 +1239,30 @@ int trace_probe_create(const char *raw_command, int (*createfn)(int, const char  	return ret;  } + +int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, +		 u8 *data, void *field) +{ +	void *p; +	int i, j; + +	for (i = 0; i < nr_args; i++) { +		struct probe_arg *a = args + i; + +		trace_seq_printf(s, " %s=", a->name); +		if (likely(!a->count)) { +			if (!a->type->print(s, data + a->offset, field)) +				return -ENOMEM; +			continue; +		} +		trace_seq_putc(s, '{'); +		p = data + a->offset; +		for (j = 0; j < a->count; j++) { +			if (!a->type->print(s, p, field)) +				return -ENOMEM; +			trace_seq_putc(s, j == a->count - 1 ? 
'}' : ','); +			p += a->type->size; +		} +	} +	return 0; +} diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 23acfd1c3812..ef8ed3b65d05 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -166,6 +166,7 @@ DECLARE_BASIC_PRINT_TYPE_FUNC(x16);  DECLARE_BASIC_PRINT_TYPE_FUNC(x32);  DECLARE_BASIC_PRINT_TYPE_FUNC(x64); +DECLARE_BASIC_PRINT_TYPE_FUNC(char);  DECLARE_BASIC_PRINT_TYPE_FUNC(string);  DECLARE_BASIC_PRINT_TYPE_FUNC(symbol); @@ -348,6 +349,8 @@ int trace_probe_compare_arg_type(struct trace_probe *a, struct trace_probe *b);  bool trace_probe_match_command_args(struct trace_probe *tp,  				    int argc, const char **argv);  int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **)); +int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args, +		 u8 *data, void *field);  #define trace_probe_for_each_link(pos, tp)	\  	list_for_each_entry(pos, &(tp)->event->files, list) diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index 77dbd9ff9782..c4e1d4c03a85 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -12,7 +12,7 @@   */  /* Return the length of string -- including null terminal byte */  static nokprobe_inline int -kern_fetch_store_strlen_user(unsigned long addr) +fetch_store_strlen_user(unsigned long addr)  {  	const void __user *uaddr =  (__force const void __user *)addr;  	int ret; @@ -29,14 +29,14 @@ kern_fetch_store_strlen_user(unsigned long addr)  /* Return the length of string -- including null terminal byte */  static nokprobe_inline int -kern_fetch_store_strlen(unsigned long addr) +fetch_store_strlen(unsigned long addr)  {  	int ret, len = 0;  	u8 c;  #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE  	if (addr < TASK_SIZE) -		return kern_fetch_store_strlen_user(addr); +		return fetch_store_strlen_user(addr);  #endif  	do { @@ -63,7 +63,7 @@ static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void   * with max length and relative data location.   */  static nokprobe_inline int -kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) +fetch_store_string_user(unsigned long addr, void *dest, void *base)  {  	const void __user *uaddr =  (__force const void __user *)addr;  	int maxlen = get_loc_len(*(u32 *)dest); @@ -86,7 +86,7 @@ kern_fetch_store_string_user(unsigned long addr, void *dest, void *base)   * length and relative data location.   
*/  static nokprobe_inline int -kern_fetch_store_string(unsigned long addr, void *dest, void *base) +fetch_store_string(unsigned long addr, void *dest, void *base)  {  	int maxlen = get_loc_len(*(u32 *)dest);  	void *__dest; @@ -94,7 +94,7 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base)  #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE  	if ((unsigned long)addr < TASK_SIZE) -		return kern_fetch_store_string_user(addr, dest, base); +		return fetch_store_string_user(addr, dest, base);  #endif  	if (unlikely(!maxlen)) @@ -112,4 +112,22 @@ kern_fetch_store_string(unsigned long addr, void *dest, void *base)  	return ret;  } +static nokprobe_inline int +probe_mem_read_user(void *dest, void *src, size_t size) +{ +	const void __user *uaddr =  (__force const void __user *)src; + +	return copy_from_user_nofault(dest, uaddr, size); +} + +static nokprobe_inline int +probe_mem_read(void *dest, void *src, size_t size) +{ +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE +	if ((unsigned long)src < TASK_SIZE) +		return probe_mem_read_user(dest, src, size); +#endif +	return copy_from_kernel_nofault(dest, src, size); +} +  #endif /* __TRACE_PROBE_KERNEL_H_ */ diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 5cea672243f6..00707630788d 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -98,6 +98,26 @@ fetch_store_symstring(unsigned long addr, void *dest, void *base)  	return sprint_symbol(__dest, addr);  } +/* common part of process_fetch_insn*/ +static nokprobe_inline int +process_common_fetch_insn(struct fetch_insn *code, unsigned long *val) +{ +	switch (code->op) { +	case FETCH_OP_IMM: +		*val = code->immediate; +		break; +	case FETCH_OP_COMM: +		*val = (unsigned long)current->comm; +		break; +	case FETCH_OP_DATA: +		*val = (unsigned long)code->data; +		break; +	default: +		return -EILSEQ; +	} +	return 0; +} +  /* From the 2nd stage, routine is same */  static nokprobe_inline int  process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val, @@ -253,31 +273,3 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec,  		}  	}  } - -static inline int -print_probe_args(struct trace_seq *s, struct probe_arg *args, int nr_args, -		 u8 *data, void *field) -{ -	void *p; -	int i, j; - -	for (i = 0; i < nr_args; i++) { -		struct probe_arg *a = args + i; - -		trace_seq_printf(s, " %s=", a->name); -		if (likely(!a->count)) { -			if (!a->type->print(s, data + a->offset, field)) -				return -ENOMEM; -			continue; -		} -		trace_seq_putc(s, '{'); -		p = data + a->offset; -		for (j = 0; j < a->count; j++) { -			if (!a->type->print(s, p, field)) -				return -ENOMEM; -			trace_seq_putc(s, j == a->count - 1 ? '}' : ','); -			p += a->type->size; -		} -	} -	return 0; -} diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 9c90b3a7dce2..e5e299260d0c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -403,3 +403,26 @@ int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str,  	return 1;  }  EXPORT_SYMBOL(trace_seq_hex_dump); + +/* + * trace_seq_acquire - acquire seq buffer with size len + * @s: trace sequence descriptor + * @len: size of buffer to be acquired + * + * acquire buffer with size of @len from trace_seq for output usage, + * user can fill string into that buffer. + * + * Returns start address of acquired buffer. + * + * it allow multiple usage in one trace output function call. 
+ */ +char *trace_seq_acquire(struct trace_seq *s, unsigned int len) +{ +	char *ret = trace_seq_buffer_ptr(s); + +	if (!WARN_ON_ONCE(seq_buf_buffer_left(&s->seq) < len)) +		seq_buf_commit(&s->seq, len); + +	return ret; +} +EXPORT_SYMBOL(trace_seq_acquire); diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h index b29595fe3ac5..43f6fb6078db 100644 --- a/kernel/trace/trace_synth.h +++ b/kernel/trace/trace_synth.h @@ -18,6 +18,7 @@ struct synth_field {  	bool is_signed;  	bool is_string;  	bool is_dynamic; +	bool is_stack;  };  struct synth_event { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8d64b6553aed..8b92e34ff0c8 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -220,6 +220,7 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,  {  	struct pt_regs *regs = rec;  	unsigned long val; +	int ret;  	/* 1st stage: get value from context */  	switch (code->op) { @@ -235,20 +236,16 @@ process_fetch_insn(struct fetch_insn *code, void *rec, void *dest,  	case FETCH_OP_RETVAL:  		val = regs_return_value(regs);  		break; -	case FETCH_OP_IMM: -		val = code->immediate; -		break;  	case FETCH_OP_COMM:  		val = FETCH_TOKEN_COMM;  		break; -	case FETCH_OP_DATA: -		val = (unsigned long)code->data; -		break;  	case FETCH_OP_FOFFS:  		val = translate_user_vaddr(code->immediate);  		break;  	default: -		return -EILSEQ; +		ret = process_common_fetch_insn(code, &val); +		if (ret < 0) +			return ret;  	}  	code++; @@ -1042,7 +1039,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e  		data = DATAOF_TRACE_ENTRY(entry, false);  	} -	if (print_probe_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0) +	if (trace_probe_print_args(s, tu->tp.args, tu->tp.nr_args, data, entry) < 0)  		goto out;  	trace_seq_putc(s, '\n'); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f23144af5743..8d1507dd0724 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -571,8 +571,8 @@ static void for_each_tracepoint_range(  bool trace_module_has_bad_taint(struct module *mod)  {  	return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | -			       (1 << TAINT_UNSIGNED_MODULE) | -			       (1 << TAINT_TEST)); +				(1 << TAINT_UNSIGNED_MODULE) | (1 << TAINT_TEST) | +				(1 << TAINT_LIVEPATCH));  }  static BLOCKING_NOTIFIER_HEAD(tracepoint_notify_list); diff --git a/kernel/umh.c b/kernel/umh.c index 850631518665..60aa9e764a38 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -32,9 +32,6 @@  #include <trace/events/module.h> -#define CAP_BSET	(void *)1 -#define CAP_PI		(void *)2 -  static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;  static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;  static DEFINE_SPINLOCK(umh_sysctl_lock); @@ -438,21 +435,27 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)  	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */  		goto unlock; -	if (wait & UMH_KILLABLE) -		state |= TASK_KILLABLE; -  	if (wait & UMH_FREEZABLE)  		state |= TASK_FREEZABLE; -	retval = wait_for_completion_state(&done, state); -	if (!retval) -		goto wait_done; -  	if (wait & UMH_KILLABLE) { +		retval = wait_for_completion_state(&done, state | TASK_KILLABLE); +		if (!retval) +			goto wait_done; +  		/* umh_complete() will see NULL and free sub_info */  		if (xchg(&sub_info->complete, NULL))  			goto unlock; + +		/* +		 * fallthrough; in case of -ERESTARTSYS now do uninterruptible +		 * wait_for_completion_state(). 
Since umh_complete() shall call +		 * complete() in a moment if xchg() above returned NULL, this +		 * uninterruptible wait_for_completion_state() will not block +		 * SIGKILL'ed processes for long. +		 */  	} +	wait_for_completion_state(&done, state);  wait_done:  	retval = sub_info->retval; @@ -495,9 +498,9 @@ static int proc_cap_handler(struct ctl_table *table, int write,  			 void *buffer, size_t *lenp, loff_t *ppos)  {  	struct ctl_table t; -	unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; -	kernel_cap_t new_cap; -	int err, i; +	unsigned long cap_array[2]; +	kernel_cap_t new_cap, *cap; +	int err;  	if (write && (!capable(CAP_SETPCAP) ||  		      !capable(CAP_SYS_MODULE))) @@ -506,16 +509,13 @@ static int proc_cap_handler(struct ctl_table *table, int write,  	/*  	 * convert from the global kernel_cap_t to the ulong array to print to  	 * userspace if this is a read. +	 * +	 * Legacy format: capabilities are exposed as two 32-bit values  	 */ +	cap = table->data;  	spin_lock(&umh_sysctl_lock); -	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)  { -		if (table->data == CAP_BSET) -			cap_array[i] = usermodehelper_bset.cap[i]; -		else if (table->data == CAP_PI) -			cap_array[i] = usermodehelper_inheritable.cap[i]; -		else -			BUG(); -	} +	cap_array[0] = (u32) cap->val; +	cap_array[1] = cap->val >> 32;  	spin_unlock(&umh_sysctl_lock);  	t = *table; @@ -529,22 +529,15 @@ static int proc_cap_handler(struct ctl_table *table, int write,  	if (err < 0)  		return err; -	/* -	 * convert from the sysctl array of ulongs to the kernel_cap_t -	 * internal representation -	 */ -	for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) -		new_cap.cap[i] = cap_array[i]; +	new_cap.val = (u32)cap_array[0]; +	new_cap.val += (u64)cap_array[1] << 32;  	/*  	 * Drop everything not in the new_cap (but don't add things)  	 */  	if (write) {  		spin_lock(&umh_sysctl_lock); -		if (table->data == CAP_BSET) -			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); -		if (table->data == CAP_PI) -			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); +		*cap = cap_intersect(*cap, new_cap);  		spin_unlock(&umh_sysctl_lock);  	} @@ -554,15 +547,15 @@ static int proc_cap_handler(struct ctl_table *table, int write,  struct ctl_table usermodehelper_table[] = {  	{  		.procname	= "bset", -		.data		= CAP_BSET, -		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), +		.data		= &usermodehelper_bset, +		.maxlen		= 2 * sizeof(unsigned long),  		.mode		= 0600,  		.proc_handler	= proc_cap_handler,  	},  	{  		.procname	= "inheritable", -		.data		= CAP_PI, -		.maxlen		= _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), +		.data		= &usermodehelper_inheritable, +		.maxlen		= 2 * sizeof(unsigned long),  		.mode		= 0600,  		.proc_handler	= proc_cap_handler,  	}, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 54211dbd516c..1d8e47bed3f1 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -229,7 +229,7 @@ void __put_user_ns(struct user_namespace *ns)  EXPORT_SYMBOL(__put_user_ns);  /** - * idmap_key struct holds the information necessary to find an idmapping in a + * struct idmap_key - holds the information necessary to find an idmapping in a   * sorted idmap array. It is passed to cmp_map_id() as first argument.   
*/  struct idmap_key { diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index a6f9bdd956c3..f10f403104e7 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -273,6 +273,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)  	if (ret < 0)  		goto error; +	ret = -ENOMEM;  	pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);  	if (!pages)  		goto error; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 07895deca271..b8b541caed48 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -169,7 +169,9 @@ struct worker_pool {  	struct list_head	idle_list;	/* L: list of idle workers */  	struct timer_list	idle_timer;	/* L: worker idle timeout */ -	struct timer_list	mayday_timer;	/* L: SOS timer for workers */ +	struct work_struct      idle_cull_work; /* L: worker idle cleanup */ + +	struct timer_list	mayday_timer;	  /* L: SOS timer for workers */  	/* a workers is either on busy_hash or idle_list, or the manager */  	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); @@ -177,6 +179,7 @@ struct worker_pool {  	struct worker		*manager;	/* L: purely informational */  	struct list_head	workers;	/* A: attached workers */ +	struct list_head        dying_workers;  /* A: workers about to die */  	struct completion	*detach_completion; /* all workers detached */  	struct ida		worker_ida;	/* worker IDs for task name */ @@ -326,7 +329,7 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);  static LIST_HEAD(workqueues);		/* PR: list of all workqueues */  static bool workqueue_freezing;		/* PL: have wqs started freezing? */ -/* PL: allowable cpus for unbound wqs and work items */ +/* PL&A: allowable cpus for unbound wqs and work items */  static cpumask_var_t wq_unbound_cpumask;  /* CPU where unbound work was last round robin scheduled from this CPU */ @@ -1433,9 +1436,13 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,  	lockdep_assert_irqs_disabled(); -	/* if draining, only works from the same workqueue are allowed */ -	if (unlikely(wq->flags & __WQ_DRAINING) && -	    WARN_ON_ONCE(!is_chained_work(wq))) +	/* +	 * For a draining wq, only works from the same workqueue are +	 * allowed. The __WQ_DESTROYING helps to spot the issue that +	 * queues a new work item to a wq after destroy_workqueue(wq). 
+	 */ +	if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) && +		     WARN_ON_ONCE(!is_chained_work(wq))))  		return;  	rcu_read_lock();  retry: @@ -1900,7 +1907,7 @@ static void worker_detach_from_pool(struct worker *worker)  	list_del(&worker->node);  	worker->pool = NULL; -	if (list_empty(&pool->workers)) +	if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))  		detach_completion = pool->detach_completion;  	mutex_unlock(&wq_pool_attach_mutex); @@ -1972,21 +1979,55 @@ fail:  	return NULL;  } +static void unbind_worker(struct worker *worker) +{ +	lockdep_assert_held(&wq_pool_attach_mutex); + +	kthread_set_per_cpu(worker->task, -1); +	if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) +		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); +	else +		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); +} + +static void wake_dying_workers(struct list_head *cull_list) +{ +	struct worker *worker, *tmp; + +	list_for_each_entry_safe(worker, tmp, cull_list, entry) { +		list_del_init(&worker->entry); +		unbind_worker(worker); +		/* +		 * If the worker was somehow already running, then it had to be +		 * in pool->idle_list when set_worker_dying() happened or we +		 * wouldn't have gotten here. +		 * +		 * Thus, the worker must either have observed the WORKER_DIE +		 * flag, or have set its state to TASK_IDLE. Either way, the +		 * below will be observed by the worker and is safe to do +		 * outside of pool->lock. +		 */ +		wake_up_process(worker->task); +	} +} +  /** - * destroy_worker - destroy a workqueue worker + * set_worker_dying - Tag a worker for destruction   * @worker: worker to be destroyed + * @list: transfer worker away from its pool->idle_list and into list   * - * Destroy @worker and adjust @pool stats accordingly.  The worker should - * be idle. + * Tag @worker for destruction and adjust @pool stats accordingly.  The worker + * should be idle.   *   * CONTEXT:   * raw_spin_lock_irq(pool->lock).   */ -static void destroy_worker(struct worker *worker) +static void set_worker_dying(struct worker *worker, struct list_head *list)  {  	struct worker_pool *pool = worker->pool;  	lockdep_assert_held(&pool->lock); +	lockdep_assert_held(&wq_pool_attach_mutex);  	/* sanity check frenzy */  	if (WARN_ON(worker->current_work) || @@ -1997,34 +2038,94 @@ static void destroy_worker(struct worker *worker)  	pool->nr_workers--;  	pool->nr_idle--; -	list_del_init(&worker->entry);  	worker->flags |= WORKER_DIE; -	wake_up_process(worker->task); + +	list_move(&worker->entry, list); +	list_move(&worker->node, &pool->dying_workers);  } +/** + * idle_worker_timeout - check if some idle workers can now be deleted. + * @t: The pool's idle_timer that just expired + * + * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in + * worker_leave_idle(), as a worker flicking between idle and active while its + * pool is at the too_many_workers() tipping point would cause too much timer + * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let + * it expire and re-evaluate things from there. 
+ */  static void idle_worker_timeout(struct timer_list *t)  {  	struct worker_pool *pool = from_timer(pool, t, idle_timer); +	bool do_cull = false; + +	if (work_pending(&pool->idle_cull_work)) +		return;  	raw_spin_lock_irq(&pool->lock); -	while (too_many_workers(pool)) { +	if (too_many_workers(pool)) {  		struct worker *worker;  		unsigned long expires;  		/* idle_list is kept in LIFO order, check the last one */  		worker = list_entry(pool->idle_list.prev, struct worker, entry);  		expires = worker->last_active + IDLE_WORKER_TIMEOUT; +		do_cull = !time_before(jiffies, expires); + +		if (!do_cull) +			mod_timer(&pool->idle_timer, expires); +	} +	raw_spin_unlock_irq(&pool->lock); + +	if (do_cull) +		queue_work(system_unbound_wq, &pool->idle_cull_work); +} + +/** + * idle_cull_fn - cull workers that have been idle for too long. + * @work: the pool's work for handling these idle workers + * + * This goes through a pool's idle workers and gets rid of those that have been + * idle for at least IDLE_WORKER_TIMEOUT seconds. + * + * We don't want to disturb isolated CPUs because of a pcpu kworker being + * culled, so this also resets worker affinity. This requires a sleepable + * context, hence the split between timer callback and work item. + */ +static void idle_cull_fn(struct work_struct *work) +{ +	struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); +	struct list_head cull_list; + +	INIT_LIST_HEAD(&cull_list); +	/* +	 * Grabbing wq_pool_attach_mutex here ensures an already-running worker +	 * cannot proceed beyong worker_detach_from_pool() in its self-destruct +	 * path. This is required as a previously-preempted worker could run after +	 * set_worker_dying() has happened but before wake_dying_workers() did. +	 */ +	mutex_lock(&wq_pool_attach_mutex); +	raw_spin_lock_irq(&pool->lock); + +	while (too_many_workers(pool)) { +		struct worker *worker; +		unsigned long expires; + +		worker = list_entry(pool->idle_list.prev, struct worker, entry); +		expires = worker->last_active + IDLE_WORKER_TIMEOUT;  		if (time_before(jiffies, expires)) {  			mod_timer(&pool->idle_timer, expires);  			break;  		} -		destroy_worker(worker); +		set_worker_dying(worker, &cull_list);  	}  	raw_spin_unlock_irq(&pool->lock); +	wake_dying_workers(&cull_list); +	mutex_unlock(&wq_pool_attach_mutex);  }  static void send_mayday(struct work_struct *work) @@ -2388,12 +2489,12 @@ woke_up:  	/* am I supposed to die? */  	if (unlikely(worker->flags & WORKER_DIE)) {  		raw_spin_unlock_irq(&pool->lock); -		WARN_ON_ONCE(!list_empty(&worker->entry));  		set_pf_worker(false);  		set_task_comm(worker->task, "kworker/dying");  		ida_free(&pool->worker_ida, worker->id);  		worker_detach_from_pool(worker); +		WARN_ON_ONCE(!list_empty(&worker->entry));  		kfree(worker);  		return 0;  	} @@ -3462,10 +3563,12 @@ static int init_worker_pool(struct worker_pool *pool)  	hash_init(pool->busy_hash);  	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE); +	INIT_WORK(&pool->idle_cull_work, idle_cull_fn);  	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);  	INIT_LIST_HEAD(&pool->workers); +	INIT_LIST_HEAD(&pool->dying_workers);  	ida_init(&pool->worker_ida);  	INIT_HLIST_NODE(&pool->hash_node); @@ -3540,18 +3643,6 @@ static void rcu_free_pool(struct rcu_head *rcu)  	kfree(pool);  } -/* This returns with the lock held on success (pool manager is inactive). 
*/ -static bool wq_manager_inactive(struct worker_pool *pool) -{ -	raw_spin_lock_irq(&pool->lock); - -	if (pool->flags & POOL_MANAGER_ACTIVE) { -		raw_spin_unlock_irq(&pool->lock); -		return false; -	} -	return true; -} -  /**   * put_unbound_pool - put a worker_pool   * @pool: worker_pool to put @@ -3566,8 +3657,11 @@ static bool wq_manager_inactive(struct worker_pool *pool)  static void put_unbound_pool(struct worker_pool *pool)  {  	DECLARE_COMPLETION_ONSTACK(detach_completion); +	struct list_head cull_list;  	struct worker *worker; +	INIT_LIST_HEAD(&cull_list); +  	lockdep_assert_held(&wq_pool_mutex);  	if (--pool->refcnt) @@ -3587,20 +3681,38 @@ static void put_unbound_pool(struct worker_pool *pool)  	 * Become the manager and destroy all workers.  This prevents  	 * @pool's workers from blocking on attach_mutex.  We're the last  	 * manager and @pool gets freed with the flag set. -	 * Because of how wq_manager_inactive() works, we will hold the -	 * spinlock after a successful wait. +	 * +	 * Having a concurrent manager is quite unlikely to happen as we can +	 * only get here with +	 *   pwq->refcnt == pool->refcnt == 0 +	 * which implies no work queued to the pool, which implies no worker can +	 * become the manager. However a worker could have taken the role of +	 * manager before the refcnts dropped to 0, since maybe_create_worker() +	 * drops pool->lock  	 */ -	rcuwait_wait_event(&manager_wait, wq_manager_inactive(pool), -			   TASK_UNINTERRUPTIBLE); -	pool->flags |= POOL_MANAGER_ACTIVE; +	while (true) { +		rcuwait_wait_event(&manager_wait, +				   !(pool->flags & POOL_MANAGER_ACTIVE), +				   TASK_UNINTERRUPTIBLE); + +		mutex_lock(&wq_pool_attach_mutex); +		raw_spin_lock_irq(&pool->lock); +		if (!(pool->flags & POOL_MANAGER_ACTIVE)) { +			pool->flags |= POOL_MANAGER_ACTIVE; +			break; +		} +		raw_spin_unlock_irq(&pool->lock); +		mutex_unlock(&wq_pool_attach_mutex); +	}  	while ((worker = first_idle_worker(pool))) -		destroy_worker(worker); +		set_worker_dying(worker, &cull_list);  	WARN_ON(pool->nr_workers || pool->nr_idle);  	raw_spin_unlock_irq(&pool->lock); -	mutex_lock(&wq_pool_attach_mutex); -	if (!list_empty(&pool->workers)) +	wake_dying_workers(&cull_list); + +	if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))  		pool->detach_completion = &detach_completion;  	mutex_unlock(&wq_pool_attach_mutex); @@ -3609,6 +3721,7 @@ static void put_unbound_pool(struct worker_pool *pool)  	/* shut down the timers */  	del_timer_sync(&pool->idle_timer); +	cancel_work_sync(&pool->idle_cull_work);  	del_timer_sync(&pool->mayday_timer);  	/* RCU protected to allow dereferences from get_work_pool() */ @@ -3952,7 +4065,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)  /* allocate the attrs and pwqs for later installation */  static struct apply_wqattrs_ctx *  apply_wqattrs_prepare(struct workqueue_struct *wq, -		      const struct workqueue_attrs *attrs) +		      const struct workqueue_attrs *attrs, +		      const cpumask_var_t unbound_cpumask)  {  	struct apply_wqattrs_ctx *ctx;  	struct workqueue_attrs *new_attrs, *tmp_attrs; @@ -3968,14 +4082,15 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,  		goto out_free;  	/* -	 * Calculate the attrs of the default pwq. +	 * Calculate the attrs of the default pwq with unbound_cpumask +	 * which is wq_unbound_cpumask or to set to wq_unbound_cpumask.  	 * If the user configured cpumask doesn't overlap with the  	 * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.  	 
*/  	copy_workqueue_attrs(new_attrs, attrs); -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask); +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);  	if (unlikely(cpumask_empty(new_attrs->cpumask))) -		cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask); +		cpumask_copy(new_attrs->cpumask, unbound_cpumask);  	/*  	 * We may create multiple pwqs with differing cpumasks.  Make a @@ -4072,7 +4187,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,  		wq->flags &= ~__WQ_ORDERED;  	} -	ctx = apply_wqattrs_prepare(wq, attrs); +	ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);  	if (!ctx)  		return -ENOMEM; @@ -4414,6 +4529,11 @@ void destroy_workqueue(struct workqueue_struct *wq)  	 */  	workqueue_sysfs_unregister(wq); +	/* mark the workqueue destruction is in progress */ +	mutex_lock(&wq->mutex); +	wq->flags |= __WQ_DESTROYING; +	mutex_unlock(&wq->mutex); +  	/* drain it before proceeding with destruction */  	drain_workqueue(wq); @@ -4709,22 +4829,53 @@ static void pr_cont_pool_info(struct worker_pool *pool)  	pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);  } -static void pr_cont_work(bool comma, struct work_struct *work) +struct pr_cont_work_struct { +	bool comma; +	work_func_t func; +	long ctr; +}; + +static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp) +{ +	if (!pcwsp->ctr) +		goto out_record; +	if (func == pcwsp->func) { +		pcwsp->ctr++; +		return; +	} +	if (pcwsp->ctr == 1) +		pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func); +	else +		pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func); +	pcwsp->ctr = 0; +out_record: +	if ((long)func == -1L) +		return; +	pcwsp->comma = comma; +	pcwsp->func = func; +	pcwsp->ctr = 1; +} + +static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)  {  	if (work->func == wq_barrier_func) {  		struct wq_barrier *barr;  		barr = container_of(work, struct wq_barrier, work); +		pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);  		pr_cont("%s BAR(%d)", comma ? "," : "",  			task_pid_nr(barr->task));  	} else { -		pr_cont("%s %ps", comma ? "," : "", work->func); +		if (!comma) +			pr_cont_work_flush(comma, (work_func_t)-1, pcwsp); +		pr_cont_work_flush(comma, work->func, pcwsp);  	}  }  static void show_pwq(struct pool_workqueue *pwq)  { +	struct pr_cont_work_struct pcws = { .ctr = 0, };  	struct worker_pool *pool = pwq->pool;  	struct work_struct *work;  	struct worker *worker; @@ -4757,7 +4908,8 @@ static void show_pwq(struct pool_workqueue *pwq)  				worker->rescue_wq ? 
"(RESCUER)" : "",  				worker->current_func);  			list_for_each_entry(work, &worker->scheduled, entry) -				pr_cont_work(false, work); +				pr_cont_work(false, work, &pcws); +			pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);  			comma = true;  		}  		pr_cont("\n"); @@ -4777,9 +4929,10 @@ static void show_pwq(struct pool_workqueue *pwq)  			if (get_work_pwq(work) != pwq)  				continue; -			pr_cont_work(comma, work); +			pr_cont_work(comma, work, &pcws);  			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);  		} +		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);  		pr_cont("\n");  	} @@ -4788,9 +4941,10 @@ static void show_pwq(struct pool_workqueue *pwq)  		pr_info("    inactive:");  		list_for_each_entry(work, &pwq->inactive_works, entry) { -			pr_cont_work(comma, work); +			pr_cont_work(comma, work, &pcws);  			comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);  		} +		pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);  		pr_cont("\n");  	}  } @@ -5006,13 +5160,8 @@ static void unbind_workers(int cpu)  		raw_spin_unlock_irq(&pool->lock); -		for_each_pool_worker(worker, pool) { -			kthread_set_per_cpu(worker->task, -1); -			if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask)) -				WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0); -			else -				WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); -		} +		for_each_pool_worker(worker, pool) +			unbind_worker(worker);  		mutex_unlock(&wq_pool_attach_mutex);  	} @@ -5334,7 +5483,7 @@ out_unlock:  }  #endif /* CONFIG_FREEZER */ -static int workqueue_apply_unbound_cpumask(void) +static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)  {  	LIST_HEAD(ctxs);  	int ret = 0; @@ -5350,7 +5499,7 @@ static int workqueue_apply_unbound_cpumask(void)  		if (wq->flags & __WQ_ORDERED)  			continue; -		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs); +		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);  		if (!ctx) {  			ret = -ENOMEM;  			break; @@ -5365,6 +5514,11 @@ static int workqueue_apply_unbound_cpumask(void)  		apply_wqattrs_cleanup(ctx);  	} +	if (!ret) { +		mutex_lock(&wq_pool_attach_mutex); +		cpumask_copy(wq_unbound_cpumask, unbound_cpumask); +		mutex_unlock(&wq_pool_attach_mutex); +	}  	return ret;  } @@ -5383,7 +5537,6 @@ static int workqueue_apply_unbound_cpumask(void)  int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)  {  	int ret = -EINVAL; -	cpumask_var_t saved_cpumask;  	/*  	 * Not excluding isolated cpus on purpose. @@ -5397,23 +5550,8 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)  			goto out_unlock;  		} -		if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) { -			ret = -ENOMEM; -			goto out_unlock; -		} - -		/* save the old wq_unbound_cpumask. */ -		cpumask_copy(saved_cpumask, wq_unbound_cpumask); - -		/* update wq_unbound_cpumask at first and apply it to wqs. */ -		cpumask_copy(wq_unbound_cpumask, cpumask); -		ret = workqueue_apply_unbound_cpumask(); - -		/* restore the wq_unbound_cpumask when failed. */ -		if (ret < 0) -			cpumask_copy(wq_unbound_cpumask, saved_cpumask); +		ret = workqueue_apply_unbound_cpumask(cpumask); -		free_cpumask_var(saved_cpumask);  out_unlock:  		apply_wqattrs_unlock();  	}  | 

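[Editor's aside, not part of the diff] The show_pwq() changes above coalesce runs of identical work functions so a flooded pool prints "N*func" once instead of repeating the same symbol N times, with a (work_func_t)-1 value acting as a sentinel that forces a final flush after each list walk (busy, pending and inactive), so counts never leak between sections. A userspace sketch of the same run-length idea follows, using string names in place of function pointers and NULL as the sentinel; both substitutions are purely illustrative.

#include <stdio.h>
#include <string.h>

struct rle_state {
	const char *name;	/* last callback name recorded */
	long ctr;		/* length of the current run */
};

/* Emit the pending run unless @next extends it; NULL forces a final flush. */
static void rle_flush(struct rle_state *st, const char *next)
{
	if (st->ctr) {
		if (next && strcmp(next, st->name) == 0) {
			st->ctr++;	/* same callback again: extend the run */
			return;
		}
		if (st->ctr == 1)
			printf(" %s", st->name);
		else
			printf(" %ld*%s", st->ctr, st->name);
		st->ctr = 0;
	}
	if (!next)		/* sentinel, playing the role of (work_func_t)-1 */
		return;
	st->name = next;
	st->ctr = 1;
}

int main(void)
{
	static const char * const works[] = { "work_a", "work_a", "work_a", "work_b" };
	struct rle_state st = { .ctr = 0 };

	for (unsigned int i = 0; i < sizeof(works) / sizeof(works[0]); i++)
		rle_flush(&st, works[i]);
	rle_flush(&st, NULL);	/* final flush */
	putchar('\n');		/* prints: " 3*work_a work_b" */
	return 0;
}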